arm_compute v17.12
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 8a7f37a..81ad60b 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -89,7 +89,7 @@
 
 bool AccessWindowStatic::update_window_if_needed(Window &window) const
 {
-    // Only update the window size if we can't use padding
+    // If the padding is not enough and the tensor is not resizable, shrink the window to size 0
     if(_info == nullptr || _info->is_resizable())
     {
         return false;
@@ -101,81 +101,65 @@
 
     bool window_modified = false;
 
-    int front_pad_y = 0;
-
-    // Adjust window start for Y dimension
+    // Check whether the available padding is sufficient
     if(_start_y < 0)
     {
-        // Calculate rows available above the tensor
         const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
 
         if(_start_y < front_pad_y_available)
         {
-            // Not enough padding available, need to shrink the window
-            const int start = adjust_up(_start_y, front_pad_y_available, window.y().step());
-
-            window.set(1, Window::Dimension(start, window.y().end(), window.y().step()));
             window_modified = true;
         }
-
-        // Update front padding with reconstructed value
-        front_pad_y = std::max(0, -window.y().start());
     }
 
-    // Adjust window end for Y dimension
-    if(_end_y > static_cast<int>(shape[1]))
+    if(!window_modified)
     {
-        const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
-
-        // Calculate rows available below the tensor
-        const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
-
-        if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+        if(_end_y > static_cast<int>(shape[1]))
         {
-            // Not enough padding available, need to shrink the window
-            const int end = adjust_down(_end_y, shape[1] + tail_pad_y_available, window.y().step()) + window.y().step();
-            window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
-            window_modified = true;
+            const int stride_z             = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
+            const int tail_pad_y_available = (stride_z / strides[1]) - shape[1];
+
+            if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+            {
+                window_modified = true;
+            }
+        }
+
+        if(!window_modified)
+        {
+            const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
+
+            if(_start_x < 0)
+            {
+                const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+
+                if(_start_x < front_pad_x_available)
+                {
+                    window_modified = true;
+                }
+            }
+
+            if(!window_modified && _end_x > static_cast<int>(shape[0]))
+            {
+                const int tail_pad_x_available = (stride_y / strides[0]) - shape[0];
+
+                if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+                {
+                    window_modified = true;
+                }
+            }
         }
     }
 
-    int front_pad_x = 0;
-
-    const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
-
-    // Adjust window start for X dimension
-    if(_start_x < 0)
+    // If the padding is not enough, collapse the window to size 0 in every dimension
+    if(window_modified)
     {
-        const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
-
-        if(_start_x < front_pad_x_available)
+        for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
         {
-            // Not enough padding available, need to shrink the window
-            const int start = adjust_up(_start_x, front_pad_x_available, window.x().step());
-            window.set(0, Window::Dimension(start, window.x().end(), window.x().step()));
-            window_modified = true;
-        }
-
-        // Update front padding with reconstructed value
-        front_pad_x = std::max(0, -window.x().start());
-    }
-
-    // Adjust window end for X dimension
-    if(_end_x > static_cast<int>(shape[0]))
-    {
-        const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
-
-        if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
-        {
-            // Not enough padding available, need to shrink the window
-            const int end = adjust_down(_end_x, shape[0] + tail_pad_x_available, window.x().step()) + window.x().step();
-            window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
-            window_modified = true;
+            window.set(i, Window::Dimension(0, 0, 1));
         }
     }
 
-    window.validate();
-
     return window_modified;
 }
 
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index b104330..4506a0b 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -201,8 +201,8 @@
     PaddingSize padding;
     padding.left   = std::max(0, -min_x);
     padding.right  = std::max<int>(0, max_x - shape[0]);
-    padding.top    = shape.num_dimensions() == 1 ? 0 : std::max(0, -min_y);
-    padding.bottom = shape.num_dimensions() == 1 ? 0 : std::max<int>(0, max_y - shape[1]);
+    padding.top    = std::max(0, -min_y);
+    padding.bottom = std::max<int>(0, max_y - shape[1]);
 
     // Update strides in tensor info
     return _info->extend_padding(padding);
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 821fb4c..54e3e52 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/Log.h"
 #include "arm_compute/core/Types.h"
 
 #include <map>
@@ -58,6 +59,13 @@
             return arm_compute::GPUTarget::MIDGARD;
     }
 }
+
+bool extension_support(const cl::Device &device, const char *extension_name)
+{
+    std::string extensions = device.getInfo<CL_DEVICE_EXTENSIONS>();
+    auto        pos        = extensions.find(extension_name);
+    return (pos != std::string::npos);
+}
 } // namespace
 
 namespace arm_compute
@@ -72,6 +80,8 @@
             return "qs8";
         case DataType::S8:
             return "char";
+        case DataType::QASYMM8:
+            return "uchar";
         case DataType::U16:
             return "ushort";
         case DataType::S16:
@@ -105,6 +115,7 @@
         case DataType::U8:
         case DataType::QS8:
         case DataType::S8:
+        case DataType::QASYMM8:
             return "8";
         case DataType::U16:
         case DataType::S16:
@@ -177,7 +188,7 @@
 
     if(!found_mali)
     {
-        ARM_COMPUTE_INFO("Can't find valid Mali GPU. Target is set to MIDGARD.");
+        ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Mali GPU. Target is set to MIDGARD.");
         return GPUTarget::MIDGARD;
     }
 
@@ -191,7 +202,7 @@
         case 'G':
             return get_bifrost_target(version);
         default:
-            ARM_COMPUTE_INFO("Mali GPU unknown. Target is set to the default one.");
+            ARM_COMPUTE_LOG_INFO_MSG_CORE("Mali GPU unknown. Target is set to the default one.");
             return GPUTarget::MIDGARD;
     }
 }
@@ -203,21 +214,12 @@
 
 bool non_uniform_workgroup_support(const cl::Device &device)
 {
-    std::vector<char> extension;
-    size_t            extension_size = 0;
-    cl_int            err            = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, 0, nullptr, &extension_size);
-    ARM_COMPUTE_ERROR_ON_MSG((err != 0) || (extension_size == 0), "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
-    // Resize vector
-    extension.resize(extension_size);
-    // Query extension
-    err = clGetDeviceInfo(device.get(), CL_DEVICE_EXTENSIONS, extension_size, extension.data(), nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
-    ARM_COMPUTE_UNUSED(err);
+    return extension_support(device, "cl_arm_non_uniform_work_group_size");
+}
 
-    std::string extension_str(extension.begin(), extension.end());
-    auto        pos = extension_str.find("cl_arm_non_uniform_work_group_size");
-    return (pos != std::string::npos);
+bool fp16_support(const cl::Device &device)
+{
+    return extension_support(device, "cl_khr_fp16");
 }
 
 CLVersion get_cl_version(const cl::Device &device)
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6e5e802..de75518 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -35,6 +35,47 @@
 
 using namespace arm_compute;
 
+CLBuildOptions::CLBuildOptions()
+    : _build_opts()
+{
+}
+
+void CLBuildOptions::add_option(std::string option)
+{
+    _build_opts.emplace(std::move(option));
+}
+
+void CLBuildOptions::add_option_if(bool cond, std::string option)
+{
+    if(cond)
+    {
+        add_option(std::move(option));
+    }
+}
+
+void CLBuildOptions::add_option_if_else(bool cond, std::string option_true, std::string option_false)
+{
+    (cond) ? add_option(std::move(option_true)) : add_option(std::move(option_false));
+}
+
+void CLBuildOptions::add_options(const StringSet &options)
+{
+    _build_opts.insert(options.begin(), options.end());
+}
+
+void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
+{
+    if(cond)
+    {
+        add_options(options);
+    }
+}
+
+const CLBuildOptions::StringSet &CLBuildOptions::options() const
+{
+    return _build_opts;
+}
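+
+// Example usage (illustrative only; the option strings below are arbitrary):
+//   CLBuildOptions build_opts;
+//   build_opts.add_option("-DDATA_TYPE=uchar");
+//   build_opts.add_option_if(fp16_supported, "-DARM_COMPUTE_OPENCL_FP16_ENABLED=1");
+//   build_opts.add_option_if_else(is_signed, "-DSIGNED=1", "-DSIGNED=0");
+//   const CLBuildOptions::StringSet &opts = build_opts.options();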
+
 Program::Program()
     : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
 {
@@ -107,6 +148,7 @@
     { "accumulate_squared", "accumulate.cl" },
     { "accumulate_weighted", "accumulate.cl" },
     { "activation_layer", "activation_layer.cl" },
+    { "activation_layer_qa8", "activation_layer_qa8.cl" },
     { "arithmetic_add", "arithmetic_op.cl" },
     { "arithmetic_sub", "arithmetic_op.cl" },
     { "bitwise_or", "bitwise_op.cl" },
@@ -145,6 +187,7 @@
     { "copy_planes_3p", "channel_combine.cl" },
     { "copy_to_keypoint", "fast_corners.cl" },
     { "depthwise_convolution_3x3", "depthwise_convolution.cl" },
+    { "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" },
     { "depthwise_im2col", "depthwise_convolution.cl" },
     { "depthwise_vector_to_tensor", "depthwise_convolution.cl" },
     { "depthwise_weights_reshape", "depthwise_convolution.cl" },
@@ -157,6 +200,7 @@
     { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
     { "direct_convolution5x5", "direct_convolution5x5.cl" },
     { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
+    { "direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
     { "erode", "erode.cl" },
     { "fast_corners", "fast_corners.cl" },
     { "fill_image_borders_constant", "fill_border.cl" },
@@ -174,19 +218,27 @@
     { "gemm_ma_qs8", "gemm.cl" },
     { "gemm_ma_qs16", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
-    { "gemm_mm_interleaved_transposed_u8", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_midgard", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_qs8", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_qs16", "gemm.cl" },
     { "gemm_mm_floating_point", "gemm.cl" },
+    { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
+    { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
     { "gemm_mm_qs8", "gemm.cl" },
     { "gemm_mm_qs16", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_transpose1x16", "gemm.cl" },
     { "gemm_transpose1x8", "gemm.cl" },
     { "gemm_transpose1x4", "gemm.cl" },
+    { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
+    { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
+    { "gemmlowp_mm", "gemmlowp.cl" },
+    { "gemmlowp_mm_interleaved_transposed", "gemmlowp.cl" },
+    { "gemmlowp_offset_contribution", "gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
     { "harris_score_3x3", "harris_corners.cl" },
     { "harris_score_5x5", "harris_corners.cl" },
     { "harris_score_7x7", "harris_corners.cl" },
@@ -199,6 +251,7 @@
     { "hog_orientation_binning", "hog.cl" },
     { "hysteresis", "canny.cl" },
     { "im2col_generic", "convolution_layer.cl" },
+    { "im2col_generic_padx0_pady0", "convolution_layer.cl" },
     { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cl" },
     { "im2col_reduced", "convolution_layer.cl" },
     { "init_level", "optical_flow_pyramid_lk.cl" },
@@ -227,7 +280,7 @@
     { "non_linear_filter_disk5x5", "non_linear_filter5x5.cl" },
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
-    { "normalization_layer_in_map_1D", "normalization_layer.cl" },
+    { "normalization_layer_in_map", "normalization_layer.cl" },
     { "batchnormalization_layer", "batchnormalization_layer.cl" },
     { "NV12_to_IYUV_bt709", "color_convert.cl" },
     { "NV12_to_RGB888_bt709", "color_convert.cl" },
@@ -241,9 +294,10 @@
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
     { "pooling_layer_2", "pooling_layer.cl" },
     { "pooling_layer_3", "pooling_layer.cl" },
-    { "pooling_layer_3_optimized", "pooling_layer.cl" },
+    { "pooling_layer_optimized_3", "pooling_layer.cl" },
     { "pooling_layer_7", "pooling_layer.cl" },
     { "pooling_layer_N", "pooling_layer.cl" },
+    { "pooling_layer_N_quantized", "pooling_layer_quantized.cl" },
     { "quantization_layer", "quantization_layer.cl" },
     { "reduction_operation", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
@@ -268,8 +322,13 @@
     { "sobel_separable7x1", "sobel_filter.cl" },
     { "sobel_separable1x7", "sobel_filter.cl" },
     { "softmax_layer_max", "softmax_layer.cl" },
+    { "softmax_layer_max_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_shift_exp_sum", "softmax_layer.cl" },
+    { "softmax_layer_shift_exp_sum_quantized", "softmax_layer_quantized.cl" },
     { "softmax_layer_norm", "softmax_layer.cl" },
+    { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+    { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
@@ -306,6 +365,10 @@
 #include "./cl_kernels/activation_layer.clembed"
     },
     {
+        "activation_layer_qa8.cl",
+#include "./cl_kernels/activation_layer_qa8.clembed"
+    },
+    {
         "arithmetic_op.cl",
 #include "./cl_kernels/arithmetic_op.clembed"
     },
@@ -366,6 +429,10 @@
 #include "./cl_kernels/depthwise_convolution.clembed"
     },
     {
+        "depthwise_convolution_quantized.cl",
+#include "./cl_kernels/depthwise_convolution_quantized.clembed"
+    },
+    {
         "dequantization_layer.cl",
 #include "./cl_kernels/dequantization_layer.clembed"
     },
@@ -390,6 +457,10 @@
 #include "./cl_kernels/direct_convolution5x5.clembed"
     },
     {
+        "direct_convolution_1x1_3x3_5x5_quantized.cl",
+#include "./cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.clembed"
+    },
+    {
         "erode.cl",
 #include "./cl_kernels/erode.clembed"
     },
@@ -418,6 +489,10 @@
 #include "./cl_kernels/gemm.clembed"
     },
     {
+        "gemmlowp.cl",
+#include "./cl_kernels/gemmlowp.clembed"
+    },
+    {
         "gemv.cl",
 #include "./cl_kernels/gemv.clembed"
     },
@@ -430,6 +505,10 @@
 #include "./cl_kernels/helpers.hembed"
     },
     {
+        "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+    },
+    {
         "histogram.cl",
 #include "./cl_kernels/histogram.clembed"
     },
@@ -502,6 +581,10 @@
 #include "./cl_kernels/pooling_layer.clembed"
     },
     {
+        "pooling_layer_quantized.cl",
+#include "./cl_kernels/pooling_layer_quantized.clembed"
+    },
+    {
         "quantization_layer.cl",
 #include "./cl_kernels/quantization_layer.clembed"
     },
@@ -538,6 +621,10 @@
 #include "./cl_kernels/softmax_layer.clembed"
     },
     {
+        "softmax_layer_quantized.cl",
+#include "./cl_kernels/softmax_layer_quantized.clembed"
+    },
+    {
         "tablelookup.cl",
 #include "./cl_kernels/tablelookup.clembed"
     },
@@ -591,6 +678,11 @@
 
     std::string concat_str;
 
+    if(fp16_support(_device))
+    {
+        concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+    }
+
     if(non_uniform_workgroup_support(_device))
     {
         concat_str += " -cl-arm-non-uniform-work-group-size ";
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 17b58b7..3eb94b7 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -43,15 +43,14 @@
         return;
     }
 
-    if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+    cl::NDRange gws = ICLKernel::gws_from_window(window);
+
+    // Check for empty NDRange
+    if(gws.dimensions() == 0)
     {
         return;
     }
 
-    cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
-                    (window.y().end() - window.y().start()) / window.y().step(),
-                    (window.z().end() - window.z().start()) / window.z().step());
-
     cl::NDRange valid_lws;
     if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
     {
@@ -182,3 +181,17 @@
     }
     return _max_workgroup_size;
 }
+
+cl::NDRange ICLKernel::gws_from_window(const Window &window)
+{
+    if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+    {
+        return cl::NullRange;
+    }
+
+    cl::NDRange gws((window.x().end() - window.x().start()) / window.x().step(),
+                    (window.y().end() - window.y().start()) / window.y().step(),
+                    (window.z().end() - window.z().start()) / window.z().step());
+
+    return gws;
+}
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index 7b0d011..0bd9d15 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -41,7 +41,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 287c5e2..157b6d6 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -71,39 +71,46 @@
         return false;
     }
 
-    clBuildProgram            = reinterpret_cast<clBuildProgram_func>(dlsym(handle, "clBuildProgram"));
-    clEnqueueNDRangeKernel    = reinterpret_cast<clEnqueueNDRangeKernel_func>(dlsym(handle, "clEnqueueNDRangeKernel"));
-    clSetKernelArg            = reinterpret_cast<clSetKernelArg_func>(dlsym(handle, "clSetKernelArg"));
-    clReleaseKernel           = reinterpret_cast<clReleaseKernel_func>(dlsym(handle, "clReleaseKernel"));
-    clCreateProgramWithSource = reinterpret_cast<clCreateProgramWithSource_func>(dlsym(handle, "clCreateProgramWithSource"));
-    clCreateBuffer            = reinterpret_cast<clCreateBuffer_func>(dlsym(handle, "clCreateBuffer"));
-    clRetainKernel            = reinterpret_cast<clRetainKernel_func>(dlsym(handle, "clRetainKernel"));
-    clCreateKernel            = reinterpret_cast<clCreateKernel_func>(dlsym(handle, "clCreateKernel"));
-    clGetProgramInfo          = reinterpret_cast<clGetProgramInfo_func>(dlsym(handle, "clGetProgramInfo"));
-    clFlush                   = reinterpret_cast<clFlush_func>(dlsym(handle, "clFlush"));
-    clFinish                  = reinterpret_cast<clFinish_func>(dlsym(handle, "clFinish"));
-    clReleaseProgram          = reinterpret_cast<clReleaseProgram_func>(dlsym(handle, "clReleaseProgram"));
-    clRetainContext           = reinterpret_cast<clRetainContext_func>(dlsym(handle, "clRetainContext"));
-    clCreateProgramWithBinary = reinterpret_cast<clCreateProgramWithBinary_func>(dlsym(handle, "clCreateProgramWithBinary"));
-    clReleaseCommandQueue     = reinterpret_cast<clReleaseCommandQueue_func>(dlsym(handle, "clReleaseCommandQueue"));
-    clEnqueueMapBuffer        = reinterpret_cast<clEnqueueMapBuffer_func>(dlsym(handle, "clEnqueueMapBuffer"));
-    clRetainProgram           = reinterpret_cast<clRetainProgram_func>(dlsym(handle, "clRetainProgram"));
-    clGetProgramBuildInfo     = reinterpret_cast<clGetProgramBuildInfo_func>(dlsym(handle, "clGetProgramBuildInfo"));
-    clEnqueueReadBuffer       = reinterpret_cast<clEnqueueReadBuffer_func>(dlsym(handle, "clEnqueueReadBuffer"));
-    clEnqueueWriteBuffer      = reinterpret_cast<clEnqueueWriteBuffer_func>(dlsym(handle, "clEnqueueWriteBuffer"));
-    clReleaseEvent            = reinterpret_cast<clReleaseEvent_func>(dlsym(handle, "clReleaseEvent"));
-    clReleaseContext          = reinterpret_cast<clReleaseContext_func>(dlsym(handle, "clReleaseContext"));
-    clRetainCommandQueue      = reinterpret_cast<clRetainCommandQueue_func>(dlsym(handle, "clRetainCommandQueue"));
-    clEnqueueUnmapMemObject   = reinterpret_cast<clEnqueueUnmapMemObject_func>(dlsym(handle, "clEnqueueUnmapMemObject"));
-    clRetainMemObject         = reinterpret_cast<clRetainMemObject_func>(dlsym(handle, "clRetainMemObject"));
-    clReleaseMemObject        = reinterpret_cast<clReleaseMemObject_func>(dlsym(handle, "clReleaseMemObject"));
-    clGetDeviceInfo           = reinterpret_cast<clGetDeviceInfo_func>(dlsym(handle, "clGetDeviceInfo"));
-    clGetDeviceIDs            = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
-    clRetainEvent             = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
-    clGetPlatformIDs          = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs"));
-    clGetKernelWorkGroupInfo  = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo"));
+#define LOAD_FUNCTION_PTR(func_name, handle) \
+    func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
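+    // For example, LOAD_FUNCTION_PTR(clBuildProgram, handle) expands to:
+    //   clBuildProgram_ptr = reinterpret_cast<decltype(clBuildProgram) *>(dlsym(handle, "clBuildProgram"));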
 
-    dlclose(handle);
+    LOAD_FUNCTION_PTR(clBuildProgram, handle);
+    LOAD_FUNCTION_PTR(clEnqueueNDRangeKernel, handle);
+    LOAD_FUNCTION_PTR(clSetKernelArg, handle);
+    LOAD_FUNCTION_PTR(clReleaseKernel, handle);
+    LOAD_FUNCTION_PTR(clCreateProgramWithSource, handle);
+    LOAD_FUNCTION_PTR(clCreateBuffer, handle);
+    LOAD_FUNCTION_PTR(clRetainKernel, handle);
+    LOAD_FUNCTION_PTR(clCreateKernel, handle);
+    LOAD_FUNCTION_PTR(clGetProgramInfo, handle);
+    LOAD_FUNCTION_PTR(clFlush, handle);
+    LOAD_FUNCTION_PTR(clFinish, handle);
+    LOAD_FUNCTION_PTR(clReleaseProgram, handle);
+    LOAD_FUNCTION_PTR(clRetainContext, handle);
+    LOAD_FUNCTION_PTR(clCreateProgramWithBinary, handle);
+    LOAD_FUNCTION_PTR(clReleaseCommandQueue, handle);
+    LOAD_FUNCTION_PTR(clEnqueueMapBuffer, handle);
+    LOAD_FUNCTION_PTR(clRetainProgram, handle);
+    LOAD_FUNCTION_PTR(clGetProgramBuildInfo, handle);
+    LOAD_FUNCTION_PTR(clEnqueueReadBuffer, handle);
+    LOAD_FUNCTION_PTR(clEnqueueWriteBuffer, handle);
+    LOAD_FUNCTION_PTR(clReleaseEvent, handle);
+    LOAD_FUNCTION_PTR(clReleaseContext, handle);
+    LOAD_FUNCTION_PTR(clRetainCommandQueue, handle);
+    LOAD_FUNCTION_PTR(clEnqueueUnmapMemObject, handle);
+    LOAD_FUNCTION_PTR(clRetainMemObject, handle);
+    LOAD_FUNCTION_PTR(clReleaseMemObject, handle);
+    LOAD_FUNCTION_PTR(clGetDeviceInfo, handle);
+    LOAD_FUNCTION_PTR(clGetDeviceIDs, handle);
+    LOAD_FUNCTION_PTR(clRetainEvent, handle);
+    LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
+    LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
+
+#undef LOAD_FUNCTION_PTR
+
+    // Don't call dlclose(handle) or all the symbols will be unloaded!
 
     // Disable default loading and set status to successful
     _loaded = std::make_pair(true, true);
@@ -114,7 +121,7 @@
 bool opencl_is_available()
 {
     CLSymbols::get().load_default();
-    return CLSymbols::get().clBuildProgram != nullptr;
+    return CLSymbols::get().clBuildProgram_ptr != nullptr;
 }
 } // namespace arm_compute
 
@@ -127,7 +134,7 @@
     void *user_data)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clBuildProgram;
+    auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr;
     if(func != nullptr)
     {
         return func(program, num_devices, device_list, options, pfn_notify, user_data);
@@ -150,7 +157,7 @@
     cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel;
+    auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr;
     if(func != nullptr)
     {
         return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
@@ -168,7 +175,7 @@
     const void *arg_value)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clSetKernelArg;
+    auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr;
     if(func != nullptr)
     {
         return func(kernel, arg_index, arg_size, arg_value);
@@ -182,7 +189,7 @@
 cl_int clRetainMemObject(cl_mem memobj)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainMemObject;
+    auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr;
     if(func != nullptr)
     {
         return func(memobj);
@@ -196,7 +203,7 @@
 cl_int clReleaseMemObject(cl_mem memobj)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseMemObject;
+    auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr;
     if(func != nullptr)
     {
         return func(memobj);
@@ -216,7 +223,7 @@
     cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject;
+    auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr;
     if(func != nullptr)
     {
         return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
@@ -230,7 +237,7 @@
 cl_int clRetainCommandQueue(cl_command_queue command_queue)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainCommandQueue;
+    auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr;
     if(func != nullptr)
     {
         return func(command_queue);
@@ -244,7 +251,7 @@
 cl_int clReleaseContext(cl_context context)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseContext;
+    auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr;
     if(func != nullptr)
     {
         return func(context);
@@ -257,7 +264,7 @@
 cl_int clReleaseEvent(cl_event event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseEvent;
+    auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr;
     if(func != nullptr)
     {
         return func(event);
@@ -280,7 +287,7 @@
     cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer;
+    auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr;
     if(func != nullptr)
     {
         return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -303,7 +310,7 @@
     cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer;
+    auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr;
     if(func != nullptr)
     {
         return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
@@ -323,7 +330,7 @@
     size_t               *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo;
+    auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr;
     if(func != nullptr)
     {
         return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -337,7 +344,7 @@
 cl_int clRetainProgram(cl_program program)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainProgram;
+    auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr;
     if(func != nullptr)
     {
         return func(program);
@@ -361,7 +368,7 @@
     cl_int          *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer;
+    auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr;
     if(func != nullptr)
     {
         return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
@@ -379,7 +386,7 @@
 cl_int clReleaseCommandQueue(cl_command_queue command_queue)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue;
+    auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr;
     if(func != nullptr)
     {
         return func(command_queue);
@@ -400,7 +407,7 @@
     cl_int               *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary;
+    auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr;
     if(func != nullptr)
     {
         return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
@@ -418,7 +425,7 @@
 cl_int clRetainContext(cl_context context)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainContext;
+    auto func = arm_compute::CLSymbols::get().clRetainContext_ptr;
     if(func != nullptr)
     {
         return func(context);
@@ -432,7 +439,7 @@
 cl_int clReleaseProgram(cl_program program)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseProgram;
+    auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr;
     if(func != nullptr)
     {
         return func(program);
@@ -446,7 +453,7 @@
 cl_int clFlush(cl_command_queue command_queue)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clFlush;
+    auto func = arm_compute::CLSymbols::get().clFlush_ptr;
     if(func != nullptr)
     {
         return func(command_queue);
@@ -460,7 +467,7 @@
 cl_int clFinish(cl_command_queue command_queue)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clFinish;
+    auto func = arm_compute::CLSymbols::get().clFinish_ptr;
     if(func != nullptr)
     {
         return func(command_queue);
@@ -479,7 +486,7 @@
     size_t         *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetProgramInfo;
+    auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr;
     if(func != nullptr)
     {
         return func(program, param_name, param_value_size, param_value, param_value_size_ret);
@@ -496,7 +503,7 @@
     cl_int     *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clCreateKernel;
+    auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr;
     if(func != nullptr)
     {
         return func(program, kernel_name, errcode_ret);
@@ -514,7 +521,7 @@
 cl_int clRetainKernel(cl_kernel kernel)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainKernel;
+    auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr;
     if(func != nullptr)
     {
         return func(kernel);
@@ -533,7 +540,7 @@
     cl_int      *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clCreateBuffer;
+    auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr;
     if(func != nullptr)
     {
         return func(context, flags, size, host_ptr, errcode_ret);
@@ -556,7 +563,7 @@
     cl_int       *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource;
+    auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr;
     if(func != nullptr)
     {
         return func(context, count, strings, lengths, errcode_ret);
@@ -574,7 +581,7 @@
 cl_int clReleaseKernel(cl_kernel kernel)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clReleaseKernel;
+    auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr;
     if(func != nullptr)
     {
         return func(kernel);
@@ -592,7 +599,7 @@
                       cl_uint       *num_devices)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetDeviceIDs;
+    auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr;
     if(func != nullptr)
     {
         return func(platform, device_type, num_entries, devices, num_devices);
@@ -610,7 +617,7 @@
                        size_t        *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetDeviceInfo;
+    auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr;
     if(func != nullptr)
     {
         return func(device, param_name, param_value_size, param_value, param_value_size_ret);
@@ -624,7 +631,7 @@
 cl_int clRetainEvent(cl_event event)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clRetainEvent;
+    auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr;
     if(func != nullptr)
     {
         return func(event);
@@ -638,7 +645,7 @@
 cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetPlatformIDs;
+    auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr;
     if(func != nullptr)
     {
         return func(num_entries, platforms, num_platforms);
@@ -658,7 +665,7 @@
                          size_t                   *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
-    auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo;
+    auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr;
     if(func != nullptr)
     {
         return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
new file mode 100644
index 0000000..910a93f
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max(0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+/** This performs an activation function on QASYMM8 inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=lu_brelu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
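+ * @note Only the bounded (brelu) and lower-upper bounded (lu_brelu) ReLU activations are defined in this file, so ACT is expected to be one of those.
+ * @note Illustrative set of build options (values are arbitrary): -DDATA_TYPE=uchar -DVEC_SIZE=16 -DACT=lu_brelu -DA_VAL=192 -DB_VAL=64 -DS1_VAL=0.0627451f -DS2_VAL=0.0627451f -DO1_VAL=0 -DO2_VAL=0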
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer_qa8(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    // Perform activation
+    data = ACTIVATION_OP(ACT, data);
+
+    // requantize to output space
+    float16 fdata = convert_float16(data);
+    fdata         = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
+    uchar16 qdata = convert_uchar16_sat(fdata);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (qdata, 0, (__global DATA_TYPE *)output.ptr);
+}
diff --git a/src/core/CL/cl_kernels/asymm_helper.h b/src/core/CL/cl_kernels/asymm_helper.h
new file mode 100644
index 0000000..18c1475
--- /dev/null
+++ b/src/core/CL/cl_kernels/asymm_helper.h
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ASYMM_HELPER_H
+#define ARM_COMPUTE_ASYMM_HELPER_H
+
+// Algorithms for these functions were taken from
+// https://github.com/google/gemmlowp/blob/master/fixedpoint/fixedpoint.h
+// and adapted to operate on integer vectors.
+
+/** For each element of the input vector, all bits of the corresponding result element are set
+ * if the input element is zero.
+ *
+ * @param[in] a Input vector; its zero elements select which result elements have all bits set.
+ *
+ * @returns Output vector with all bits set in each element whose corresponding element in @p a is zero.
+ */
+inline int16 asymm_mask_if_zero(int16 a)
+{
+    const int16 all_zeros = 0;
+    const int16 all_ones  = ~0;
+    return select(all_zeros, all_ones, a == 0);
+}
+
+/** For each element of the input vector, all bits of the corresponding result element are set
+ * if the input element is non-zero.
+ *
+ * @param[in] a Input vector; its non-zero elements select which result elements have all bits set.
+ *
+ * @returns Output vector with all bits set in each element whose corresponding element in @p a is non-zero.
+ */
+inline int16 asymm_mask_if_non_zero(int16 a)
+{
+    const int16 all_zeros = 0;
+    const int16 all_ones  = ~0;
+    return select(all_zeros, all_ones, a != 0);
+}
+
+/** Each bit of the result is set to the corresponding bit of either then_val or
+ * else_val depending on whether the corresponding bit of if_mask is set.
+ * Equivalent to the VBSL instruction in ARM NEON.
+ *
+ * @param[in] if_mask  Mask that selects, bit by bit, whether the result bit is taken from @p then_val (bit set) or @p else_val (bit clear).
+ * @param[in] then_val Value whose bit will be used for result when corresponding bit in @p if_mask is set.
+ * @param[in] else_val Value whose bit will be used for result when corresponding bit in @p if_mask is not set.
+ *
+ * @returns Result containing bits from @p then_val where the corresponding bit in @p if_mask is set, and bits from @p else_val otherwise.
+ */
+inline int16 asymm_select_using_mask(int16 if_mask, int16 then_val, int16 else_val)
+{
+    return (if_mask & then_val) ^ (~if_mask & else_val);
+}
+
+/** Correctly rounded to nearest division by a power of two.
+ * Also known as a rounding arithmetic right shift.
+ *
+ * @param[in] x        Value to be divided by a power of two.
+ * @param[in] exponent Exponent of the power of two; must be positive.
+ *
+ * @return x divided by 2^exponent, rounded to the nearest integer.
+ */
+inline int16 asymm_rounding_divide_by_pow2(int16 x, int exponent)
+{
+    int16       mask      = (1 << exponent) - 1;
+    const int16 zero      = 0;
+    const int16 one       = 1;
+    int16       threshold = (mask >> 1) + select(zero, one, x < 0);
+    return (x >> exponent) + select(zero, one, (x & mask) > threshold);
+}
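+// Illustrative example: asymm_rounding_divide_by_pow2((int16)100, 3) returns 13 in every lane, since 100 / 8 = 12.5 rounds up to 13.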
+
+/** Calculates the product of an integer value by a power of two, with either a positive exponent
+ * (equivalent to an arithmetic left shift, saturating) or a negative exponent
+ * (equivalent to an arithmetic right shift, rounding to nearest).
+ *
+ * @param[in] x        Value to be multiplied (positive @p exponent) or divided (negative @p exponent) by a power of two.
+ * @param[in] exponent Exponent of the power of two; can be positive or negative.
+ *
+ * @return x scaled by 2^exponent, saturated for a left shift and rounded for a right shift.
+ */
+inline int16 asymm_saturating_rounding_mult_by_pow2(int16 x, int exponent)
+{
+    if(exponent < 0)
+    {
+        return asymm_rounding_divide_by_pow2(x, -exponent);
+    }
+
+    const int16 min           = INT_MIN;
+    const int16 max           = INT_MAX;
+    int         threshold     = ((1 << (31 - exponent)) - 1);
+    int16       positive_mask = asymm_mask_if_non_zero(x > threshold);
+    int16       negative_mask = asymm_mask_if_non_zero(x < -threshold);
+    int16       result        = x << exponent;
+    result                    = asymm_select_using_mask(positive_mask, max, result);
+    result                    = asymm_select_using_mask(negative_mask, min, result);
+    return result;
+}
+
+/** Calculates (a+b)/2, rounded to the nearest integer.
+ * Equivalent to VRHADD in the ARM NEON instruction set.
+ *
+ * @param[in] a First term of half-sum.
+ * @param[in] b Second term of half-sum.
+ *
+ * @return (a+b)/2, rounded to the nearest integer.
+ */
+inline int16 asymm_rounding_half_sum(int16 a, int16 b)
+{
+    long16       a64       = convert_long16(a);
+    long16       b64       = convert_long16(b);
+    long16       sum       = a64 + b64;
+    const long16 one       = 1;
+    const long16 minus_one = -1;
+    long16       sign      = select(minus_one, one, sum >= 0);
+    return convert_int16((sum + sign) / 2);
+}
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ * This is equivalent to the VQRDMULH instruction in ARM NEON.
+ *
+ * @param[in] a First term of product.
+ * @param[in] b Second term of product.
+ *
+ * @return Product of two numbers.
+ */
+inline int16 asymm_saturating_rounding_doubling_high_mul(int16 a, int16 b)
+{
+    int16  overflow     = (a == b) && (a == INT_MIN);
+    long16 a_64         = convert_long16(a);
+    long16 b_64         = convert_long16(b);
+    long16 ab_64        = a_64 * b_64;
+    long16 mask1        = 1 << 30;
+    long16 mask2        = 1 - (1 << 30);
+    long16 nudge        = select(mask2, mask1, ab_64 >= 0);
+    long16 mask         = 1ll << 31;
+    int16  ab_x2_high32 = convert_int16((ab_64 + nudge) / mask);
+    return select(ab_x2_high32, INT_MAX, overflow);
+}
+
+/** Fixed-point multiplication.
+ *
+ * @param[in] a Argument 1 in fixed-point format Q(a).
+ * @param[in] b Argument 2 in fixed-point format Q(b).
+ *
+ * @return Result in fixed-point format Q(a+b).
+ */
+inline int16 asymm_mult(int16 a, int16 b)
+{
+    return asymm_saturating_rounding_doubling_high_mul(a, b);
+}
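+// Illustrative example: with a = b = 1 << 30 (0.5 in Q0), asymm_mult returns 1 << 29 (0.25 in Q0).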
+
+/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0).
+ *
+ * @param[in] a Argument in fixed-point format Q0.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+inline int16 asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(int16 a)
+{
+    const int16 constant_term                            = 1895147668;
+    const int16 constant_1_over_3                        = 715827883;
+    const int   k_fractional_bits                        = 31;
+    int16       x                                        = a + (1 << (k_fractional_bits - 3));
+    int16       x2                                       = asymm_mult(x, x);
+    int16       x3                                       = asymm_mult(x2, x);
+    int16       x4                                       = asymm_mult(x2, x2);
+    int16       x4_over_4                                = asymm_rounding_divide_by_pow2(x4, 2);
+    int16       x4_over_24_plus_x3_over_6_plus_x2        = asymm_mult((x4_over_4 + x3), constant_1_over_3) + x2;
+    int16       x4_over_24_plus_x3_over_6_plus_x2_over_2 = asymm_rounding_divide_by_pow2(x4_over_24_plus_x3_over_6_plus_x2, 1);
+    return constant_term + asymm_mult(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2);
+}
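+// Note: constant_term above is exp(-1/8) in Q0 and constant_1_over_3 is 1/3 in Q0; the function evaluates
+// exp(a) as exp(-1/8) * exp(a + 1/8) using a fourth-order Taylor polynomial, as in the gemmlowp reference.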
+
+/** Calculates \f$ exp(x) \f$ for x < 0.
+ *
+ * @param[in] a              Argument in fixed-point format Q(k_integer_bits).
+ * @param[in] k_integer_bits Number of integer bits in the argument.
+ *
+ * @return Result in fixed-point format Q0.
+ */
+inline int16 asymm_exp_on_negative_values(int16 a, int k_integer_bits)
+{
+    const int k_fractional_bits                      = 31 - k_integer_bits;
+    int16     k_one_quarter                          = 1 << (k_fractional_bits - 2);
+    int16     mask                                   = k_one_quarter - 1;
+    int16     a_mod_quarter_minus_one_quarter        = (a & mask) - k_one_quarter;
+    int16     a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;
+    int16     result                                 = asymm_exp_on_interval_between_negative_one_quarter_and_0_excl(a_mod_quarter_minus_one_quarter_scaled);
+    int16     remainder                              = a_mod_quarter_minus_one_quarter - a;
+
+#define EXP_BARREL_SHIFTER(Exponent, FixedPointMultiplier)                                       \
+    if(k_integer_bits > Exponent)                                                                \
+    {                                                                                            \
+        const int k_shift_amount = k_integer_bits > Exponent ? k_fractional_bits + Exponent : 0; \
+        result                   = asymm_select_using_mask(                                      \
+                                                                                                 asymm_mask_if_non_zero(remainder & (1 << k_shift_amount)),                           \
+                                                                                                 asymm_mult(result, FixedPointMultiplier), result);                                   \
+    }
+    EXP_BARREL_SHIFTER(-2, 1672461947);
+    EXP_BARREL_SHIFTER(-1, 1302514674);
+    EXP_BARREL_SHIFTER(+0, 790015084);
+    EXP_BARREL_SHIFTER(+1, 290630308);
+    EXP_BARREL_SHIFTER(+2, 39332535);
+    EXP_BARREL_SHIFTER(+3, 720401);
+    EXP_BARREL_SHIFTER(+4, 242);
+#undef EXP_BARREL_SHIFTER
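+    // Each FixedPointMultiplier above is round(exp(-2^Exponent) * 2^31), i.e. exp(-2^Exponent) in Q0,
+    // so the barrel shifter multiplies in exp(-2^Exponent) for every remainder bit that is set.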
+
+    if(k_integer_bits > 5)
+    {
+        const int16 clamp = -(1 << (k_fractional_bits + 5));
+        result            = asymm_select_using_mask(asymm_mask_if_non_zero(a < clamp), 0, result);
+    }
+
+    const int16 Q0_one = INT_MAX;
+    return asymm_select_using_mask(asymm_mask_if_zero(a), Q0_one, result);
+}
+
+/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
+ *
+ * @param[in] a Argument in fixed-point format Q0.
+ *
+ * @return Result in fixed-point format Q0.
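+ *
+ * @note Implemented with three Newton-Raphson iterations for the reciprocal, starting from the classic
+ *       48/17 - 32/17 * x initial estimate (the Q2 constants used below).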
+ */
+inline int16 asymm_one_over_one_plus_x_for_x_in_0_1(int16 a)
+{
+    const int16 Q0_one            = INT_MAX;
+    const int16 Q2_one            = 1 << (31 - 2);
+    int16       half_denominator  = asymm_rounding_half_sum(a, Q0_one);
+    const int16 Q2_48_over_17     = 1515870810;
+    const int16 Q2_neg_32_over_17 = -1010580540;
+    int16       x                 = Q2_48_over_17 + asymm_mult(half_denominator, Q2_neg_32_over_17);
+    for(int i = 0; i < 3; i++)
+    {
+        int16 half_denominator_times_x           = asymm_mult(half_denominator, x);
+        int16 one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x;
+        int16 tmp                                = asymm_mult(x, one_minus_half_denominator_times_x);
+        x                                        = x + asymm_saturating_rounding_mult_by_pow2(tmp, 2);
+    }
+    return asymm_saturating_rounding_mult_by_pow2(x, 1);
+}
+
+/** Considering the integer value as fixed-point, change the number of integer bits and update value accordingly.
+ *
+ * @param[in] value            Value to be rescaled.
+ * @param[in] src_integer_bits Old number of integer bits.
+ * @param[in] dst_integer_bits New number of integer bits.
+ *
+ * @return Rescaled value.
+ */
+inline int16 asymm_rescale(int16 value, int src_integer_bits, int dst_integer_bits)
+{
+    int exponent = src_integer_bits - dst_integer_bits;
+    return asymm_saturating_rounding_mult_by_pow2(value, exponent);
+}
+
+#endif // ARM_COMPUTE_ASYMM_HELPER_H
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
index ec67192..94ad53c 100644
--- a/src/core/CL/cl_kernels/canny.cl
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -226,19 +226,19 @@
 
 #define EDGE 255
 #define hysteresis_local_stack_L1 8  // The size of level 1 stack. This has to agree with the host side
-#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation
+#define hysteresis_local_stack_L2 16 // The size of level 2 stack; adjusting this can impact the match rate
 
 /** Check whether pixel is valid
-*
-* Skip the pixel if the early_test fails.
-* Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full
-*
-* @param[in] early_test Boolean condition based on the minv check and visited buffer check
-* @param[in] x_pos      X-coordinate of pixel that is going to be recorded, has to be within the boundary
-* @param[in] y_pos      Y-coordinate of pixel that is going to be recorded, has to be within the boundary
-* @param[in] x_cur      X-coordinate of current central pixel
-* @param[in] y_cur      Y-coordinate of current central pixel
-*/
+ *
+ * Skip the pixel if the early_test fails.
+ * Otherwise, it tries to add the pixel coordinate to the stack, and proceed to popping the stack instead if the stack is full
+ *
+ * @param[in] early_test Boolean condition based on the minv check and visited buffer check
+ * @param[in] x_pos      X-coordinate of pixel that is going to be recorded, has to be within the boundary
+ * @param[in] y_pos      Y-coordinate of pixel that is going to be recorded, has to be within the boundary
+ * @param[in] x_cur      X-coordinate of current central pixel
+ * @param[in] y_cur      Y-coordinate of current central pixel
+ */
 #define check_pixel(early_test, x_pos, y_pos, x_cur, y_cur)                               \
     {                                                                                     \
         if(!early_test)                                                                   \
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 9e9d0b0..77b9b64 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -97,13 +97,14 @@
     }
 }
 
-#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PAD_VALUE)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The value to use for the paddings must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -117,27 +118,25 @@
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  filter_depth                      The depth of the used filter
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_generic(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
-    uint filter_depth,
     uint src_stride_w,
     uint dst_stride_w)
 {
     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % filter_depth; // input feature map
-    const int batch = get_global_id(2) / filter_depth; // the batch
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
 
-    // Calculate input indeces
-    const int xi = xc * STRIDE_X - PAD_X;
-    const int yi = yc * STRIDE_Y - PAD_Y;
+    // Calculate input indices
+    const int xi = xc * STRIDE_X - PAD_LEFT;
+    const int yi = yc * STRIDE_Y - PAD_TOP;
 
-    // Calculate output indeces
+    // Calculate output indices
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
@@ -149,18 +148,18 @@
     {
         for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
         {
-#if PAD_X == 0 && PAD_Y == 0
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
-#else  // PAD_X == 0 && PAD_Y == 0
+#else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
             {
-                *output_ptr = 0;
+                *output_ptr = PAD_VALUE;
             }
             else
             {
                 *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
             }
-#endif // PAD_X == 0 && PAD_Y == 0
+#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
         }
     }
 
@@ -181,7 +180,7 @@
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -195,27 +194,25 @@
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  filter_depth                      The depth of the used filter
  * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
  */
 __kernel void im2col_kernel3x3_padx0_pady0(
     TENSOR3D_DECLARATION(src),
     IMAGE_DECLARATION(dst),
-    uint filter_depth,
     uint src_stride_w,
     uint dst_stride_w)
 {
     const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
     const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
-    const int ch    = get_global_id(2) % filter_depth; // input feature map
-    const int batch = get_global_id(2) / filter_depth; // the batch
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
 
-    // Calculate input indeces
+    // Calculate input indices
     const int xi = xc * STRIDE_X;
     const int yi = yc * STRIDE_Y;
 
-    // Calculate output indeces
+    // Calculate output indices
     const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
     const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
 
@@ -245,14 +242,14 @@
     }
 #endif // HAS_BIAS
 }
-#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
 
 #if defined(WIDTH_OUTPUT)
 /** This kernel performs a reshaping of the output of the convolution layer.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -291,7 +288,7 @@
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
  * @note In case biases will be added in late stage, -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/F16/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -332,3 +329,86 @@
     }
 #endif // HAS_BIAS
 }
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+/** This kernel reshapes the input tensor to a tensor used to perform convolution using GEMM when
+ * the kernel width is greater than 1 (except when the kernel size is 3x3) and pad_x == pad_y == 0.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_padx0_pady0(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int xc    = get_global_id(0);                // x coordinate in the convolved tensor
+    const int yc    = get_global_id(1);                // y coordinate in the convolved tensor
+    const int ch    = get_global_id(2) % KERNEL_DEPTH; // input feature map
+    const int batch = get_global_id(2) / KERNEL_DEPTH; // batch index
+
+    // Calculate input indices
+    const int xi = xc * STRIDE_X;
+    const int yi = yc * STRIDE_Y;
+    // Calculate output indices
+    const int xo                   = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
+    const int yo                   = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
+    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
+    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        int last_x = 0;
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
+        {
+            VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+            row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
+            VSTORE(VECTOR_SIZE)
+            (row, 0, output_ptr);
+            last_x = x;
+        }
+        // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
+        // Note that output_ptr has already been advanced past the vectorized part of the row, so last_x + VECTOR_SIZE is the x of the first element still to be copied.
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+
+#ifdef HAS_BIAS
+    if(ch == (KERNEL_DEPTH - 1))
+    {
+#ifdef FIXED_POINT_POSITION
+        *output_ptr = (DATA_TYPE)(1 << FIXED_POINT_POSITION);
+#else  // FIXED_POINT_POSITION
+        *output_ptr       = 1.0f;
+#endif // FIXED_POINT_POSITION
+    }
+#endif // HAS_BIAS
+}
+#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(KERNEL_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
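
The im2col variants above map a convolution output coordinate (xc, yc) and an input channel ch to one row of the reshaped matrix: the row index is yo = xc + yc * CONVOLVED_WIDTH and the KERNEL_WIDTH * KERNEL_HEIGHT patch for that channel starts at column xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT, with out-of-bounds reads replaced by PAD_VALUE. A single-channel host-side C sketch of that mapping, using hypothetical names rather than library code:

    #include <stddef.h>

    void im2col_ref(const float *src, float *dst,
                    int src_w, int src_h,
                    int kernel_w, int kernel_h,
                    int conv_w, int conv_h,
                    int stride_x, int stride_y,
                    int pad_left, int pad_top, float pad_value)
    {
        for(int yc = 0; yc < conv_h; ++yc)
        {
            for(int xc = 0; xc < conv_w; ++xc)
            {
                const int yo  = xc + yc * conv_w;         /* row of the reshaped matrix   */
                const int xi  = xc * stride_x - pad_left; /* top-left corner of the patch */
                const int yi  = yc * stride_y - pad_top;
                float    *out = dst + (size_t)yo * kernel_w * kernel_h;

                for(int y = yi; y < yi + kernel_h; ++y)
                {
                    for(int x = xi; x < xi + kernel_w; ++x, ++out)
                    {
                        const int in_bounds = (x >= 0 && x < src_w && y >= 0 && y < src_h);
                        *out = in_bounds ? src[y * src_w + x] : pad_value;
                    }
                }
            }
        }
    }
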
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 9c2c3a5..89555a0 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -145,38 +145,53 @@
 }
 
 /** This function computes the horizontal integral of the image.
-  *
-  * @param[in] src_ptr                               Pointer to the source image. Supported data types: U8
-  * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
-  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
-  * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
-  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
-  * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
-  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
-  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
-  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F16/F32
-  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
-  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
-  * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
-  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
-  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
-  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
-  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
-  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
-  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
-  * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
-  * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
-  * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
-  * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
-  * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
-  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
-  */
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: U8
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ */
 
-__kernel void depthwise_convolution_3x3(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst), TENSOR3D_DECLARATION(weights))
+__kernel void depthwise_convolution_3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif //defined(HAS_BIAS)
+)
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif //defined(HAS_BIAS)
 
     uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
     float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
@@ -186,6 +201,9 @@
     float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                    weights_values2.s0, weights_values2.s1, weights_values2.s2);
+#if defined(HAS_BIAS)
+    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+#endif //defined(HAS_BIAS)
 
     vstore2(pixels, 0, (__global float *)dst.ptr);
 }
@@ -197,24 +215,38 @@
  *
  * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
  */
-__kernel void depthwise_weights_reshape(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+__kernel void depthwise_weights_reshape(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst)
+#ifdef HAS_BIAS
+    ,
+    VECTOR_DECLARATION(biases)
+#endif /* HAS_BIAS */
+)
 {
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* HAS_BIAS */
 
     __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
     __global uchar *output_ptr    = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;
@@ -223,14 +255,21 @@
     {
         *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
     }
+
+#if defined(HAS_BIAS)
+    if(get_global_id(1) == 0)
+    {
+        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
+    }
+#endif // defined(HAS_BIAS)
 }
 #endif //defined(SRC_WIDTH) && defined(DATA_TYPE)
 
-#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
 /** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_X, -DPAD_Y, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
+ * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -255,11 +294,11 @@
     Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
 
     const int src_pixel_linear = get_global_id(1) * STRIDE_X;
-    const int full_length      = SRC_WIDTH + 2 * PAD_X;
+    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
     const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);
 
-    const int src_x = -PAD_X + src_pixel_linear % max_initial_x;
-    const int src_y = -PAD_Y + src_pixel_linear / max_initial_x * STRIDE_Y;
+    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
+    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
     const int src_z = get_global_id(2);
 
     __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
@@ -279,9 +318,12 @@
             }
         }
     }
+#if defined(HAS_BIAS)
+    *output_ptr = (DATA_TYPE)(1);
+#endif // defined(HAS_BIAS)
 }
 
-#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_X) && defined(PAD_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE)
+#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
 
 #if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
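
The depthwise_weights_reshape kernel above flattens each channel's KERNEL_WIDTH x KERNEL_HEIGHT weights into one row of the destination matrix and, when HAS_BIAS is defined, appends the channel's bias as the last column; the matching "1" written at the end of each im2col row then multiplies that bias during the vector-matrix product. A host-side C sketch of the resulting layout, with hypothetical names that are not part of the library:

    /* weights: [channels][kernel_h][kernel_w], biases: [channels] or NULL,
     * dst: [channels][kernel_w * kernel_h (+ 1 when biases are present)]   */
    void depthwise_weights_reshape_ref(const float *weights, const float *biases, float *dst,
                                       int channels, int kernel_w, int kernel_h)
    {
        const int row_len = kernel_w * kernel_h + (biases != 0 ? 1 : 0);
        for(int c = 0; c < channels; ++c)
        {
            for(int i = 0; i < kernel_w * kernel_h; ++i)
            {
                dst[c * row_len + i] = weights[c * kernel_w * kernel_h + i];
            }
            if(biases != 0)
            {
                dst[c * row_len + kernel_w * kernel_h] = biases[c]; /* appended bias column */
            }
        }
    }
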
 
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
new file mode 100644
index 0000000..0cd4e71
--- /dev/null
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers_asymm.h"
+
+#if defined(CONV_STRIDE_X)
+
+#if CONV_STRIDE_X == 1
+#define convolution1x3 convolution1x3_stride_1
+#elif CONV_STRIDE_X == 2
+#define convolution1x3 convolution1x3_stride_2
+#elif CONV_STRIDE_X == 3
+#define convolution1x3 convolution1x3_stride_3
+#else /* CONV_STRIDE_X */
+#error "Stride not supported"
+#endif /* CONV_STRIDE_X */
+
+/** Compute a 1D horizontal convolution of size 3 and stride 1 for uchar type.
+ *
+ * @param[in] left_pixel    Pointer to the left pixel.
+ * @param[in] left_coeff    Weight of the left pixel
+ * @param[in] middle_coeff  Weight of the middle pixel
+ * @param[in] right_coeff   Weight of the right pixel
+ * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
+ * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
+ *
+ * @return an int2 containing 2 convolved values.
+ */
+inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+                                    const int             left_coeff,
+                                    const int             middle_coeff,
+                                    const int             right_coeff,
+                                    const int             input_offset,
+                                    const int             weight_offset)
+{
+    int4 temp = CONVERT(vload4(0, left_pixel), int4);
+
+    int2 left   = CONVERT(temp.s01, int2);
+    int2 middle = CONVERT(temp.s12, int2);
+    int2 right  = CONVERT(temp.s23, int2);
+
+    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
+ *
+ * @param[in] left_pixel    Pointer to the left pixel.
+ * @param[in] left_coeff    Weight of the left pixel
+ * @param[in] middle_coeff  Weight of the middle pixel
+ * @param[in] right_coeff   Weight of the right pixel
+ * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
+ * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
+ *
+ * @return an int2 containing 2 convolved values.
+ */
+inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+                                    const int             left_coeff,
+                                    const int             middle_coeff,
+                                    const int             right_coeff,
+                                    const int             input_offset,
+                                    const int             weight_offset)
+{
+    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
+    int  temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+
+    int2 left   = CONVERT(temp0.s02, int2);
+    int2 middle = CONVERT(temp0.s13, int2);
+    int2 right  = CONVERT((int2)(temp0.s2, temp1), int2);
+
+    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+}
+
+/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
+ *
+ * @param[in] left_pixel    Pointer to the left pixel.
+ * @param[in] left_coeff    Weight of the left pixel
+ * @param[in] middle_coeff  Weight of the middle pixel
+ * @param[in] right_coeff   Weight of the right pixel
+ * @param[in] input_offset  Quantized offset of zero point of the input tensor data range
+ * @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
+ *
+ * @return an int2 containing 2 convolved values.
+ */
+inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+                                    const int             left_coeff,
+                                    const int             middle_coeff,
+                                    const int             right_coeff,
+                                    const int             input_offset,
+                                    const int             weight_offset)
+{
+    int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
+    int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+
+    int2 left   = CONVERT(temp0.s03, int2);
+    int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
+    int2 right  = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+
+    return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+}
+
+/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
+ *
+ * Convolution matrix layout:
+ *
+ * [ mat0, mat1, mat2 ]\n
+ * [ mat3, mat4, mat5 ]\n
+ * [ mat6, mat7, mat8 ]\n
+ *
+ * @param[in] src               A pointer to source Image structure
+ * @param[in] mat0              Coefficient from the convolution matrix
+ * @param[in] mat1              Coefficient from the convolution matrix
+ * @param[in] mat2              Coefficient from the convolution matrix
+ * @param[in] mat3              Coefficient from the convolution matrix
+ * @param[in] mat4              Coefficient from the convolution matrix
+ * @param[in] mat5              Coefficient from the convolution matrix
+ * @param[in] mat6              Coefficient from the convolution matrix
+ * @param[in] mat7              Coefficient from the convolution matrix
+ * @param[in] mat8              Coefficient from the convolution matrix
+ * @param[in] input_offset      Quantized offset of zero point of the input tensor data range
+ * @param[in] weight_offset     Quantized offset of zero point of the weights tensor data range
+ * @param[in] output_offset     Quantized offset of zero point of the output tensor data range
+ * @param[in] output_multiplier Output scale multiplier
+ * @param[in] output_shift      Output scale divisor exponent
+ * @param[in] bias              (Optional) Bias value
+ *
+ * @return a uchar2 containing 2 convolved values.
+ */
+inline uchar2 convolution3x3(
+    Image      *src,
+    const uchar mat0, const uchar mat1, const uchar mat2,
+    const uchar mat3, const uchar mat4, const uchar mat5,
+    const uchar mat6, const uchar mat7, const uchar mat8,
+    const int input_offset, const int weight_offset, const int output_offset,
+    const int output_multiplier, const int output_shift
+#if defined(HAS_BIAS)
+    ,
+    const int bias
+#endif //defined(HAS_BIAS)
+)
+{
+    int2 pixels;
+
+    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
+    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
+    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);
+#if defined(HAS_BIAS)
+    pixels += (int2)(bias);
+#endif //defined(HAS_BIAS)
+
+    pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+    pixels = pixels + output_offset;
+    pixels = clamp(pixels, 0, 255);
+
+    return CONVERT(pixels, uchar2);
+}
+
+/** This function computes a 3x3 depthwise convolution on QASYMM8 inputs.
+ *
+ * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
+ * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z                        weights_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: QASYMM8
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ * @param[in] input_offset                          Quantized offset of zero point of the input tensor data range
+ * @param[in] weight_offset                         Quantized offset of zero point of the weights tensor data range
+ * @param[in] output_offset                         Quantized offset of zero point of the output tensor data range
+ * @param[in] output_multiplier                     Output scale multiplier
+ * @param[in] output_shift                          Output scale divisor exponent
+ */
+
+__kernel void depthwise_convolution_3x3_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(biases),
+#endif //defined(HAS_BIAS)
+    int input_offset,
+    int weight_offset,
+    int output_offset,
+    int output_multiplier,
+    int output_shift)
+{
+    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+#if defined(HAS_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif //defined(HAS_BIAS)
+
+    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
+    uchar3 weights_values0 = vload3(0, weights.ptr + offset.s0);
+    uchar3 weights_values1 = vload3(0, weights.ptr + offset.s1);
+    uchar3 weights_values2 = vload3(0, weights.ptr + offset.s2);
+
+#if defined(HAS_BIAS)
+    int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
+#endif //defined(HAS_BIAS)
+
+    uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
+                                   weights_values2.s0, weights_values2.s1, weights_values2.s2,
+                                   input_offset, weight_offset, output_offset,
+                                   output_multiplier, output_shift
+#if defined(HAS_BIAS)
+                                   ,
+                                   bias_value
+#endif //defined(HAS_BIAS)
+                                  );
+
+    vstore2(pixels, 0, dst.ptr);
+}
+
+#endif //defined(CONV_STRIDE_X)
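
The quantized 3x3 kernel above accumulates (input + input_offset) * (weight + weight_offset) products in 32-bit integers, then maps each accumulator back to QASYMM8 via ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, the output offset and a clamp to [0, 255]. A scalar C sketch of that final requantization step, assuming the usual gemmlowp-style fixed-point arithmetic (the function names below are invented):

    #include <stdint.h>

    /* Fixed-point multiply: (a * b) / 2^31 with rounding and saturation. */
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN)
        {
            return INT32_MAX; /* the only case that would overflow */
        }
        const int64_t ab    = (int64_t)a * (int64_t)b;
        const int64_t nudge = (ab >= 0) ? (1 << 30) : (1 - (1 << 30));
        return (int32_t)((ab + nudge) >> 31);
    }

    /* Arithmetic right shift with round-to-nearest (ties away from zero). */
    static int32_t rounding_right_shift(int32_t x, int shift)
    {
        const int32_t mask      = ((int32_t)1 << shift) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> shift) + (remainder > threshold ? 1 : 0);
    }

    /* Map a 32-bit accumulator to a QASYMM8 value. */
    static uint8_t requantize(int32_t acc, int32_t output_multiplier, int output_shift, int output_offset)
    {
        int32_t out = sat_rounding_doubling_high_mul(acc, output_multiplier);
        out         = rounding_right_shift(out, output_shift) + output_offset;
        if(out < 0)
        {
            out = 0;
        }
        if(out > 255)
        {
            out = 255;
        }
        return (uint8_t)out;
    }
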
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index 7b73b85..817c261 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -153,7 +153,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -241,7 +241,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index 1420d7c..a7abc9f 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -102,7 +102,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -198,7 +198,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 6fdd019..e678f6f 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -91,7 +91,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
@@ -197,7 +197,7 @@
  * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
new file mode 100644
index 0000000..d0cf032
--- /dev/null
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#undef CONVERT_SAT
+
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+
+#if KERNEL_SIZE == 5
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr));                                             \
+        int  weights_value1  = convert_int(*(weights_row_ptr + 4));                                                  \
+        int8 src0            = convert_int8(vload8(0, src_row_ptr));                                                 \
+        int4 src1            = convert_int4(vload4(0, src_row_ptr + 8));                                             \
+        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset);                                   \
+        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
+        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s345, src0.s67, src1.s012) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s45, src0.s67, src1.s0123) + input_offset) * ((int8)weights_value1 + weight_offset);     \
+    })
+
+#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int4  weights_values0 = convert_int4(vload4(0, weights_row_ptr));                                            \
+        int   weights_value1  = convert_int(*(weights_row_ptr + 4));                                                 \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                              \
+        int4  src1            = convert_int4(vload4(0, src_row_ptr + 16));                                           \
+        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset);                              \
+        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);         \
+        acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+        acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + input_offset) * ((int8)weights_values0.s3 + weight_offset); \
+        acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + input_offset) * ((int8)weights_value1 + weight_offset);     \
+    })
+
+#elif KERNEL_SIZE == 3
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr)                                                    \
+    ({                                                                                                               \
+        int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr));                                             \
+        int8 src0            = convert_int8(vload8(0, src_row_ptr));                                                 \
+        int2 src1            = convert_int2(vload2(0, src_row_ptr + 8));                                             \
+        acc += (src0 + input_offset) * ((int8)weights_values0.s0 + weight_offset);                                   \
+        acc += ((int8)(src0.s1234, src0.s567, src1.s0) + input_offset) * ((int8)weights_values0.s1 + weight_offset); \
+        acc += ((int8)(src0.s234, src0.s567, src1.s01) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+    })
+
+#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr)                                                 \
+    ({                                                                                                            \
+        int3  weights_values0 = convert_int3(vload3(0, weights_row_ptr));                                         \
+        int16 src0            = convert_int16(vload16(0, src_row_ptr));                                           \
+        int   src1            = convert_int(*(src_row_ptr + 16));                                                 \
+        acc += (src0.even + input_offset) * ((int8)weights_values0.s0 + weight_offset);                           \
+        acc += ((int8)(src0.s1357, src0.s9BDF) + input_offset) * ((int8)weights_values0.s1 + weight_offset);      \
+        acc += ((int8)(src0.s2468, src0.sACE, src1) + input_offset) * ((int8)weights_values0.s2 + weight_offset); \
+    })
+
+#elif KERNEL_SIZE == 1
+
+#if STRIDE_X == 3
+#define INPUT_PIXEL extract_input_stride3
+#elif STRIDE_X == 2
+#define INPUT_PIXEL extract_input_stride2
+#elif STRIDE_X == 1
+#define INPUT_PIXEL extract_input_stride1
+
+#else /* STRIDE_X not equal to 1, 2 or 3 */
+#error "Only strides 1, 2 and 3 are supported"
+#endif /* STRIDE_X */
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride1(__global const uchar *input_pixel)
+{
+    return vload8(0, input_pixel);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride2(__global const uchar *input_pixel)
+{
+    uchar16 temp = vload16(0, input_pixel);
+    return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 for 8-bit data.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input pixels.
+ */
+inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
+{
+    uchar16 temp1 = vload16(0, input_pixel);
+    uchar16 temp2 = vload16(0, input_pixel + 12);
+    return (uchar8)(temp1.s0369, temp2.s0369);
+}
+
+#else /* KERNEL_SIZE not equal to 1, 3 or 5 */
+#error "Only kernel sizes 1, 3 and 5 are supported"
+#endif /* KERNEL_SIZE */
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The convolution stride along X must be passed at compile time using -DSTRIDE_X, e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: S32
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  input_offset                          Input offset quantization parameter
+ * @param[in]  weight_offset                         Weights offset quantization parameter
+ * @param[in]  output_offset                         Output offset quantization parameter
+ * @param[in]  output_multiplier                     Output integer multiplier quantization parameter
+ * @param[in]  output_shift                          Output integer shift quantization parameter
+ */
+__kernel void direct_convolution_1x1_3x3_5x5_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w,
+    int          input_offset,
+    int          weight_offset,
+    int          output_offset,
+    int          output_multiplier,
+    int          output_shift)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
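+    // Each work-item computes 8 output values along X, accumulated as 32-bit integers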
+    int8 pixels0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
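+    // Accumulate over the input depth: for each slice, KERNEL_SIZE rows of the filter window are convolved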
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+#if KERNEL_SIZE == 5
+        CONVOLUTION1x5(pixels0, (__global uchar *)src_addr, (__global uchar *)weights_addr);
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 3 * src_stride_y), (__global uchar *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x5(pixels0, (__global uchar *)(src_addr + 4 * src_stride_y), (__global uchar *)(weights_addr + 4 * weights_stride_y));
+#elif KERNEL_SIZE == 3
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 0 * src_stride_y), (__global uchar *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 1 * src_stride_y), (__global uchar *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x3(pixels0, (__global uchar *)(src_addr + 2 * src_stride_y), (__global uchar *)(weights_addr + 2 * weights_stride_y));
+#elif KERNEL_SIZE == 1
+        int weight       = convert_int(*(__global uchar *)weights_addr);
+        int8 input_pixel = convert_int8(INPUT_PIXEL((__global uchar *)src_addr));
+        pixels0 += (input_pixel + input_offset) * ((int8)weight + weight_offset);
+#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) */
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector        biases    = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
+    pixels0 += (int8)(*bias_addr);
+#endif /* defined(HAS_BIAS) */
+
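+    // Requantize: scale the accumulators by the fixed-point output multiplier with a rounding shift,
+    // then add the output offset; the store below saturates the result to the unsigned 8-bit range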
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8);
+    pixels0 = pixels0 + output_offset;
+
+    vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
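The requantization tail of direct_convolution_1x1_3x3_5x5_quantized (ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, the output offset addition and the saturating store) follows the usual fixed-point scheme for asymmetric 8-bit quantization: the 32-bit accumulator is scaled by a quantized multiplier with a rounding shift, offset, and clamped to the uchar range. A minimal plain-C sketch of that arithmetic for a single accumulator is shown below; the helper names are illustrative and gemmlowp-style rounding is assumed, so this is a sketch of the idea rather than the library's own helpers.

#include <stdint.h>

/* High 32 bits of the doubled 64-bit product, rounded to nearest. */
static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; /* only case where the doubled product overflows */
    }
    const int64_t ab    = (int64_t)a * (int64_t)b;
    const int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return (int32_t)((ab + nudge) >> 31);
}

/* Arithmetic right shift with round-to-nearest. */
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t)((1u << exponent) - 1u);
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

/* Mirrors the kernel's final steps: multiply, shift, add output_offset, saturate to uchar. */
static uint8_t requantize(int32_t acc, int32_t output_multiplier, int output_shift, int32_t output_offset)
{
    int32_t x = rounding_divide_by_pow2(rounding_doubling_high_mul(acc, output_multiplier), output_shift);
    x += output_offset;
    if(x < 0)
    {
        x = 0;
    }
    if(x > 255)
    {
        x = 255;
    }
    return (uint8_t)x;
}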
diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h
index 5476a6e..d55346b 100644
--- a/src/core/CL/cl_kernels/fixed_point.h
+++ b/src/core/CL/cl_kernels/fixed_point.h
@@ -103,11 +103,11 @@
 #define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
 
 /** Computes saturating absolute value of fixed point vector.
-  *
-  * @param[in] type the actual data type.
-  *
-  * @return The result of the fixed point absolute value.
-  */
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point absolute value.
+ */
 #define ABSQ_SAT_IMPL(type)                  \
     inline type abs_##type##_sat(type VopA)  \
     {                                        \
@@ -121,11 +121,11 @@
 #define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size)
 
 /** Computes max of fixed point types.
-  *
-  * @param[in] type the actual data type.
-  *
-  * @return The result of the fixed point maximum.
-  */
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point maximum.
+ */
 #define MAXQ_IMPL(type)                          \
     inline type max_##type(type VopA, type VopB) \
     {                                            \
@@ -147,11 +147,11 @@
 #define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size)
 
 /** Computes saturated addition of fixed point types.
-  *
-  * @param[in] type the actual data type.
-  *
-  * @return The result of the fixed point addition. The result is saturated in case of overflow
-  */
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point addition. The result is saturated in case of overflow
+ */
 #define ADDQ_SAT_IMPL(type)                          \
     inline type add_sat_##type(type VopA, type VopB) \
     {                                                \
@@ -178,11 +178,11 @@
 #define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size)
 
 /** Computes saturated subtraction of fixed point types.
-  *
-  * @param[in] type the actual data type.
-  *
-  * @return The result of the fixed point subtraction. The result is saturated in case of overflow
-  */
+ *
+ * @param[in] type the actual data type.
+ *
+ * @return The result of the fixed point subtraction. The result is saturated in case of overflow
+ */
 #define SUBQ_SAT_IMPL(type)                          \
     inline type sub_sat_##type(type VopA, type VopB) \
     {                                                \
@@ -258,12 +258,12 @@
 #define MUL_SAT_OP_EXPAND(a, b, type, size, position) MUL_SAT_OP_EXPAND_STR(a, b, type, size, position)
 
 /** Saturate multiply-accumulate
-  *
-  * @param[in] type  the actual data type.
-  * @param[in] itype the intermediate data type.
-  *
-  * @return The result of the fixed point multiply-accumulate. The result is saturated in case of overflow
-  */
+ *
+ * @param[in] type  the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate. The result is saturated in case of overflow
+ */
 #define MLAQ_SAT_IMPL(type, itype)                                                                                 \
     type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position)                                 \
     {                                                                                                              \
@@ -279,12 +279,12 @@
 #define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
 
 /** Saturate multiply-accumulate long
-  *
-  * @param[in] type  the actual data type.
-  * @param[in] itype the intermediate data type.
-  *
-  * @return The result of the fixed point multiply-accumulate long. The result is saturated in case of overflow
-  */
+ *
+ * @param[in] type  the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point multiply-accumulate long. The result is saturated in case of overflow
+ */
 #define MLALQ_SAT_IMPL(type, itype)                                                                                \
     itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position)                              \
     {                                                                                                              \
@@ -299,13 +299,13 @@
 #define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position)
 
 /** Saturate division of two fixed point vectors
-  *
-  * @param[in] stype the actual scalar data type.
-  * @param[in] type  the actual data type.
-  * @param[in] itype the intermediate data type.
-  *
-  * @return The result of the fixed point division. The result is saturated in case of overflow
-  */
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type  the actual data type.
+ * @param[in] itype the intermediate data type.
+ *
+ * @return The result of the fixed point division. The result is saturated in case of overflow
+ */
 #define DIVQ_SAT_IMPL(stype, type, itype)                                                                                                                                           \
     inline type div_sat_##type(type VopA, type VopB, int fixed_point_position)                                                                                                      \
     {                                                                                                                                                                               \
@@ -329,15 +329,15 @@
 #define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position)
 
 /** Saturate exponential of a fixed point vector
-  *
-  * @note Implemented approach uses taylor polynomial to approximate the exponential function.
-  *
-  * @param[in] stype the actual scalar data type.
-  * @param[in] type  the actual data type.
-  * @param[in] size  the number of the calculated elements.
-  *
-  * @return The result of the fixed point exponential. The result is saturated in case of overflow
-  */
+ *
+ * @note Implemented approach uses a Taylor polynomial to approximate the exponential function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type  the actual data type.
+ * @param[in] size  the number of the calculated elements.
+ *
+ * @return The result of the fixed point exponential. The result is saturated in case of overflow
+ */
 #define EXPQ_IMPL(stype, type, size)                                                                                                              \
     inline type exp_sat_##type(type VopA, int fixed_point_position)                                                                               \
     {                                                                                                                                             \
@@ -359,7 +359,12 @@
         return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), clz(sum) > dec_m); /* Saturate result if needed */ \
     }
 
+EXPQ_IMPL(qs8, qs8x2, 2)
+EXPQ_IMPL(qs8, qs8x4, 4)
+EXPQ_IMPL(qs8, qs8x8, 8)
 EXPQ_IMPL(qs8, qs8x16, 16)
+EXPQ_IMPL(qs16, qs16x2, 2)
+EXPQ_IMPL(qs16, qs16x4, 4)
 EXPQ_IMPL(qs16, qs16x8, 8)
 EXPQ_IMPL(qs16, qs16x16, 16)
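The instantiations added above extend exp_sat_* to the narrower vector widths (qs8x2/4/8 and qs16x2/4) alongside the existing qs8x16, qs16x8 and qs16x16 variants. All of these helpers share the same value convention: a QS8/QS16 value q with fixed point position p represents the real number q * 2^-p, and out-of-range results saturate. A minimal plain-C sketch of the scalar QS8 conversion, with illustrative function names that are not part of the library:

#include <math.h>
#include <stdint.h>

/* Convert a real value to QS8 with the given fixed point position, saturating to the char range. */
static int8_t float_to_qs8(float x, int fixed_point_position)
{
    float scaled = roundf(x * (float)(1 << fixed_point_position));
    if(scaled > 127.0f)
    {
        scaled = 127.0f;
    }
    if(scaled < -128.0f)
    {
        scaled = -128.0f;
    }
    return (int8_t)scaled;
}

/* Convert a QS8 value back to a real number. */
static float qs8_to_float(int8_t q, int fixed_point_position)
{
    return (float)q / (float)(1 << fixed_point_position);
}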
 
@@ -367,15 +372,15 @@
 #define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position)
 
 /** Saturate logarithm of a fixed point vector
-  *
-  * @note Implemented approach uses taylor polynomial to approximate the logarithm function.
-  *
-  * @param[in] stype the actual scalar data type.
-  * @param[in] type  the actual data type.
-  * @param[in] size  the number of the calculated elements.
-  *
-  * @return The result of the fixed point logarithm. The result is saturated in case of overflow
-  */
+ *
+ * @note Implemented approach uses a Taylor polynomial to approximate the logarithm function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type  the actual data type.
+ * @param[in] size  the number of the calculated elements.
+ *
+ * @return The result of the fixed point logarithm. The result is saturated in case of overflow
+ */
 #define LOGQ_IMPL(stype, type, size)                                                                                                       \
     inline type log_sat_##type(type VopA, int fixed_point_position)                                                                        \
     {                                                                                                                                      \
@@ -405,15 +410,15 @@
 #define LOG_OP_EXPAND(a, type, size, position) LOG_OP_EXPAND_STR(a, type, size, position)
 
 /** Saturate inverse square root of a fixed point vector
-  *
-  * @note Implemented approach uses Newton's method to approximate the inverse square root function.
-  *
-  * @param[in] stype the actual scalar data type.
-  * @param[in] type  the actual data type.
-  * @param[in] size  the number of the calculated elements.
-  *
-  * @return The result of the fixed point inverse square root. The result is saturated in case of overflow
-  */
+ *
+ * @note Implemented approach uses Newton's method to approximate the inverse square root function.
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type  the actual data type.
+ * @param[in] size  the number of the calculated elements.
+ *
+ * @return The result of the fixed point inverse square root. The result is saturated in case of overflow
+ */
 #define INVSQRTQ_IMPL(stype, type, size)                                                                                                                                                                                               \
     inline type invsqrt_sat_##type(type VopA, int fixed_point_position)                                                                                                                                                                \
     {                                                                                                                                                                                                                                  \
@@ -442,15 +447,15 @@
 #define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position)
 
 /** Saturate hyperbolic tangent of a fixed point vector
-  *
-  * tanh(x) = (e^2x - 1)/(e^2x + 1)
-  *
-  * @param[in] stype the actual scalar data type.
-  * @param[in] type  the actual data type.
-  * @param[in] size  the number of the calculated elements.
-  *
-  * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of overflow
-  */
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @param[in] stype the actual scalar data type.
+ * @param[in] type  the actual data type.
+ * @param[in] size  the number of the calculated elements.
+ *
+ * @return The result of the fixed point hyperbolic tangent. The result is saturated in case of overflow
+ */
 #define TANHQ_IMPL(stype, type, size)                                                                                                             \
     inline type tanh_sat_##type(type VopA, int fixed_point_position)                                                                              \
     {                                                                                                                                             \
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 35a2e47..c763cb3 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -80,10 +80,10 @@
     uint x = get_global_id(0);
     uint y = get_global_id(1);
 
-    /* Compute address for Matrix B - source */
+    // Compute address for Matrix B - source
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 
-    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+    // Compute address for Matrix B transposed - destination. X and Y are swapped
     uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
 
     ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
@@ -112,10 +112,10 @@
     uint x = get_global_id(0);
     uint y = get_global_id(1);
 
-    /* Compute address for Matrix B - source */
+    // Compute address for Matrix B - source
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
 
-    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+    // Compute address for Matrix B transposed - destination. X and Y are swapped
     uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
 
     uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
@@ -141,11 +141,11 @@
 __kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
                                        IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from Matrix A */
+    // Load values from Matrix A
     uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
     uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
     uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
@@ -182,11 +182,11 @@
 __kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
                                        IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from Matrix A */
+    // Load values from Matrix A
     ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
     ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
     ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
@@ -223,11 +223,11 @@
 __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
                                       IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from Matrix A */
+    // Load values from Matrix A
     uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
     uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
     uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
@@ -250,155 +250,11 @@
     vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
 }
 
-/** This kernel accumulates each row with the biases vector
- *
- * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
- * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
- * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
- * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-#ifdef DATA_TYPE
-__kernel void gemm_accumulate_biases(
-    IMAGE_DECLARATION(accum),
-    VECTOR_DECLARATION(biases))
-{
-    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr);
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr);
-#ifdef FIXED_POINT_POSITION
-    accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, 16);
-#else  // FIXED_POINT_POSITION
-    accum_value = biases_value + accum_value;
-#endif // FIXED_POINT_POSITION
-
-    // Store result in the accummulate buffer
-    vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr);
-}
-#endif /* DATA_TYPE */
-
-#ifdef COLS_B
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
- *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported formats: U8
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported formats: same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                            Pointer to the destination matrix Supported formats: same as @p src0_ptr
- * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
- * @param[in]  a_offset                           Offset to be added to each element of the matrix A
- * @param[in]  b_offset                           Offset to be added to each element of the matrix B.
- * @param[in]  c_offset                           Offset to be added to each element of the matrix C.
- * @param[in]  c_mult_int                         Multiplied with each element of the matrix C.
- * @param[in]  shift                              Number of bits to shift right the result.
- */
-__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),
-                                                IMAGE_DECLARATION(src1),
-                                                IMAGE_DECLARATION(dst),
-                                                int a_offset,
-                                                int b_offset,
-                                                int c_offset,
-                                                int c_mult_int,
-                                                int shift)
-{
-    /* src_addr.s0 = address of matrix A */
-    /* src_addr.s1 = address of matrix B */
-
-    /* Compute address for matrix A and B */
-    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
-                                                                        (src1_stride_y));
-
-    /* Add offset_first_element_in_bytes */
-    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
-    /* Compute end row address for matrix B */
-    int end_row_mtx_b = src_addr.s1 + COLS_B;
-
-    /* Reset accumulators */
-    int16 c00 = 0.0f;
-    int16 c10 = 0.0f;
-    int16 c20 = 0.0f;
-    int16 c30 = 0.0f;
-
-    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))
-    {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
-        int8 a0  = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
-        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
-        c00 += (int16)a0.s0 * b0;
-        c10 += (int16)a0.s1 * b0;
-        c20 += (int16)a0.s2 * b0;
-        c30 += (int16)a0.s3 * b0;
-
-        int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
-
-        c00 += (int16)a0.s4 * b1;
-        c10 += (int16)a0.s5 * b1;
-        c20 += (int16)a0.s6 * b1;
-        c30 += (int16)a0.s7 * b1;
-    }
-
-    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
-    {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
-        int4 a0  = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
-        int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
-
-        c00 += (int16)a0.s0 * b0;
-        c10 += (int16)a0.s1 * b0;
-        c20 += (int16)a0.s2 * b0;
-        c30 += (int16)a0.s3 * b0;
-    }
-
-    /* Compute destination address */
-    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-    /* Multiply by the weight of matrix product */
-    c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;
-    c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;
-    c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;
-    c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;
-
-    /* Store 4x16 block */
-    vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));
-    vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));
-    vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));
-    vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));
-}
-#endif /* COLS_B */
-
-#if defined(COLS_B) && defined(ALPHA)
+#if defined(COLS_B)
 /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @attention The number of columns of matrix B must be passed at compile time using -DCOLS_B. The alpha value can optionally be passed using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -423,23 +279,23 @@
                                                          IMAGE_DECLARATION(src1),
                                                          IMAGE_DECLARATION(dst))
 {
-    /* src_addr.s0 = address of matrix A */
-    /* src_addr.s1 = address of matrix B */
+    // src_addr.s0 = address of matrix A
+    // src_addr.s1 = address of matrix B
 
-    /* Compute address for matrix A and B */
+    // Compute address for matrix A and B
     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                         (src1_stride_y));
 
-    /* Add offset_first_element_in_bytes */
+    // Add offset_first_element_in_bytes
     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 
-    /* Divide by 4 in order to get the src_addr in unit of float */
+    // Divide by 4 in order to get the src_addr in unit of float
     src_addr = src_addr >> 2;
 
-    /* Compute end row address for matrix B */
+    // Compute end row address for matrix B
     int end_row_mtx_b = src_addr.s1 + COLS_B;
 
-    /* Reset accumulators */
+    // Reset accumulators
     float4 c00 = 0.0f;
     float4 c10 = 0.0f;
     float4 c20 = 0.0f;
@@ -447,7 +303,7 @@
 
     for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
     {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
         float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
 
@@ -456,7 +312,7 @@
         c20 += (float4)a0.s2 * b0;
         c30 += (float4)a0.s3 * b0;
 
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
         b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
 
@@ -468,7 +324,7 @@
 
     for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
     {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
         float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
 
@@ -478,26 +334,28 @@
         c30 += (float4)a0.s3 * b0;
     }
 
-    /* Compute destination address */
+    // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Multiply by the weight of matrix product */
+#if defined(ALPHA)
+    // Multiply by the weight of matrix product
     c00 = c00 * (float4)ALPHA;
     c10 = c10 * (float4)ALPHA;
     c20 = c20 * (float4)ALPHA;
     c30 = c30 * (float4)ALPHA;
+#endif // defined(ALPHA)
 
-    /* Store 4x4 block */
+    // Store 4x4 block
     vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
     vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
     vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
     vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
 }
 
-/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @attention The number of matrix B columns must be passed at compile time using -DCOLS_B. The alpha value can optionally be passed using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -677,6 +535,7 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(ALPHA)
     // Multiply by the weight of matrix product
     c00 = c00 * ALPHA;
     c01 = c01 * ALPHA;
@@ -694,6 +553,7 @@
     c31 = c31 * ALPHA;
     c32 = c32 * ALPHA;
     c33 = c33 * ALPHA;
+#endif // defined(ALPHA)
 
     barrier(CLK_GLOBAL_MEM_FENCE);
 
@@ -704,10 +564,11 @@
     vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));
 }
 
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @attention The number of matrix B columns must be passed at compile time using -DCOLS_B. The alpha value can optionally be passed using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -732,23 +593,23 @@
                                                  IMAGE_DECLARATION(src1),
                                                  IMAGE_DECLARATION(dst))
 {
-    /* src_addr.s0 = address of matrix A */
-    /* src_addr.s1 = address of matrix B */
+    // src_addr.s0 = address of matrix A
+    // src_addr.s1 = address of matrix B
 
-    /* Compute address for matrix A and B */
+    // Compute address for matrix A and B
     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                         (src1_stride_y));
 
-    /* Add offset_first_element_in_bytes */
+    // Add offset_first_element_in_bytes
     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 
-    /* Divide by 2 in order to get the src_addr in unit of half */
+    // Divide by 2 in order to get the src_addr in unit of half
     src_addr = src_addr >> 1;
 
-    /* Compute end row address for matrix B */
+    // Compute end row address for matrix B
     int end_row_mtx_b = src_addr.s1 + COLS_B;
 
-    /* Reset accumulators */
+    // Reset accumulators
     half8 c00 = 0.0f;
     half8 c10 = 0.0f;
     half8 c20 = 0.0f;
@@ -756,7 +617,7 @@
 
     for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
     {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
         half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
 
@@ -765,7 +626,7 @@
         c20 += (half8)a0.s2 * b0;
         c30 += (half8)a0.s3 * b0;
 
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
         b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
 
@@ -777,7 +638,7 @@
 
     for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
     {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
         half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
 
@@ -787,27 +648,30 @@
         c30 += (half8)a0.s3 * b0;
     }
 
-    /* Compute destination address */
+    // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Multiply by the weight of matrix product */
+#if defined(ALPHA)
+    // Multiply by the weight of matrix product
     c00 = c00 * (half8)ALPHA;
     c10 = c10 * (half8)ALPHA;
     c20 = c20 * (half8)ALPHA;
     c30 = c30 * (half8)ALPHA;
+#endif // defined(ALPHA)
 
-    /* Store 4x8 block */
+    // Store 4x8 block
     vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));
     vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));
     vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));
     vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));
 }
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 
-#ifdef FIXED_POINT_POSITION
+#if defined(FIXED_POINT_POSITION)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
  *
- * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
+ * @attention The number of matrix B columns and the fixed point position must be passed at compile time using -DCOLS_B and -DFIXED_POINT_POSITION. The alpha value can optionally be passed using -DALPHA
  *
  * @note: ALPHA must be passed in 8 bit fixed point format
  *
@@ -834,20 +698,20 @@
                                                  IMAGE_DECLARATION(src1),
                                                  IMAGE_DECLARATION(dst))
 {
-    /* src_addr.s0 = address of matrix A */
-    /* src_addr.s1 = address of matrix B */
+    // src_addr.s0 = address of matrix A
+    // src_addr.s1 = address of matrix B
 
-    /* Compute address for matrix A and B */
+    // Compute address for matrix A and B
     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                         (src1_stride_y));
 
-    /* Add offset_first_element_in_bytes */
+    // Add offset_first_element_in_bytes
     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 
-    /* Compute end row address for matrix B */
+    // Compute end row address for matrix B
     int end_row_mtx_b = src_addr.s1 + COLS_B;
 
-    /* Reset accumulators */
+    // Reset accumulators
     short8 c00 = 0.0f;
     short8 c10 = 0.0f;
     short8 c20 = 0.0f;
@@ -857,10 +721,10 @@
     short8 c21 = 0.0f;
     short8 c31 = 0.0f;
 
-    /* This for loop performs 1 accumulation for each iteration */
+    // This for loop performs 1 accumulation for each iteration
     for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
     {
-        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        // Load values from matrix A (interleaved) and matrix B (transposed)
         char4  a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
         char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
 
@@ -875,21 +739,23 @@
         c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);
     }
 
-    /* Compute destination address */
+    // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Multiply by the weight of matrix product */
+    // Multiply by the weight of matrix product
     char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));
     char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));
     char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));
     char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));
 
+#if defined(ALPHA)
     c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
     c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
     c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
     c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
 
-    /* Store 16x4 block */
+    // Store 16x4 block
     vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
     vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
     vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
@@ -899,7 +765,7 @@
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
- * @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION
+ * @attention The number of matrix B columns and the fixed point position must be passed at compile time using -DCOLS_B and -DFIXED_POINT_POSITION. The alpha value can optionally be passed using -DALPHA
  *
  * @note: ALPHA must be passed in 16 bit fixed point format
  *
@@ -926,29 +792,29 @@
                                                   IMAGE_DECLARATION(src1),
                                                   IMAGE_DECLARATION(dst))
 {
-    /* src_addr.s0 = address of matrix A */
-    /* src_addr.s1 = address of matrix B */
+    // src_addr.s0 = address of matrix A
+    // src_addr.s1 = address of matrix B
 
-    /* Compute address for matrix A and B */
+    // Compute address for matrix A and B
     int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                         (src1_stride_y));
 
-    /* Add offset_first_element_in_bytes */
+    // Add offset_first_element_in_bytes
     src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
 
-    /* Divide by 2 in order to get the src_addr in unit of short */
+    // Divide by 2 in order to get the src_addr in unit of short
     src_addr = src_addr >> 1;
 
-    /* Compute end row address for matrix B */
+    // Compute end row address for matrix B
     int end_row_mtx_b = src_addr.s1 + COLS_B;
 
-    /* Reset accumulators */
+    // Reset accumulators
     int8 c00 = 0.0f;
     int8 c10 = 0.0f;
     int8 c20 = 0.0f;
     int8 c30 = 0.0f;
 
-    /* This for loop performs 1 accumulation for each iteration */
+    // This for loop performs 1 accumulation for each iteration
     for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
     {
         /* Load values from matrix A (interleaved) and matrix B (transposed) */
@@ -961,27 +827,30 @@
         c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);
     }
 
-    /* Compute destination address */
+    // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Multiply by the weight of matrix product */
+    // Multiply by the weight of matrix product
     short8 c00_qs16 = convert_short8_sat(c00);
     short8 c10_qs16 = convert_short8_sat(c10);
     short8 c20_qs16 = convert_short8_sat(c20);
     short8 c30_qs16 = convert_short8_sat(c30);
 
+#if defined(ALPHA)
     c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
     c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
     c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
     c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
 
-    /* Store 8x4 block */
+    // Store 8x4 block
     vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
     vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
     vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
     vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
 }
 #endif // defined(FIXED_POINT_POSITION)
+#endif // defined(COLS_B)
 
 #if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
 #if defined(DATA_TYPE)
@@ -991,7 +860,7 @@
  * @note This OpenCL kernel works with floating point data types (F16/F32)
  * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
- * @note The width of matrix A and the alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A. The alpha value can optionally be passed using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16/F32
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1040,7 +909,7 @@
     VECTOR_TYPE acc3 = 0.0f;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
     {
         // Load values from matrix A
         VEC_DATA_TYPE(DATA_TYPE, 2)
@@ -1111,35 +980,459 @@
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
     // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
     acc0 = acc0 * (VECTOR_TYPE)ALPHA;
+#endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
     (acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
     acc1 = acc1 * (VECTOR_TYPE)ALPHA;
+#endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
     (acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
     acc2 = acc2 * (VECTOR_TYPE)ALPHA;
+#endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
     (acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
     acc3 = acc3 * (VECTOR_TYPE)ALPHA;
+#endif // defined(ALPHA)
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
     (acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 #endif // defined(DATA_TYPE)
 
-#ifdef FIXED_POINT_POSITION
+/** This OpenCL kernel computes the matrix-by-matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel is optimized for -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The alpha value can optionally be passed at compile time using -DALPHA=alpha
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+                                                 IMAGE_DECLARATION(dst))
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for matrix B
+    src_addr.s1 += idx * sizeof(float);
+
+    // Address boundary for matrix A
+    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+
+    // Initialize accumulators
+    float acc00 = 0.0f;
+    float acc01 = 0.0f;
+    float acc02 = 0.0f;
+    float acc03 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float acc10 = 0.0f;
+    float acc11 = 0.0f;
+    float acc12 = 0.0f;
+    float acc13 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float acc20 = 0.0f;
+    float acc21 = 0.0f;
+    float acc22 = 0.0f;
+    float acc23 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float acc30 = 0.0f;
+    float acc31 = 0.0f;
+    float acc32 = 0.0f;
+    float acc33 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    // A and B src indices get incremented at the same time.
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    {
+        // Load values from matrix A
+        float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float2 a1 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float2 a2 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float2 a3 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+        float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+
+        // Multiply and accumulate
+        acc00 = fma(a0.s0, b0.s0, acc00);
+        acc00 = fma(a0.s1, b1.s0, acc00);
+        acc01 = fma(a0.s0, b0.s1, acc01);
+        acc01 = fma(a0.s1, b1.s1, acc01);
+        acc02 = fma(a0.s0, b0.s2, acc02);
+        acc02 = fma(a0.s1, b1.s2, acc02);
+        acc03 = fma(a0.s1, b1.s3, acc03);
+        acc03 = fma(a0.s0, b0.s3, acc03);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc10 = fma(a1.s0, b0.s0, acc10);
+        acc11 = fma(a1.s0, b0.s1, acc11);
+        acc12 = fma(a1.s0, b0.s2, acc12);
+        acc13 = fma(a1.s0, b0.s3, acc13);
+
+        acc10 = fma(a1.s1, b1.s0, acc10);
+        acc11 = fma(a1.s1, b1.s1, acc11);
+        acc12 = fma(a1.s1, b1.s2, acc12);
+        acc13 = fma(a1.s1, b1.s3, acc13);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc20 = fma(a2.s0, b0.s0, acc20);
+        acc21 = fma(a2.s0, b0.s1, acc21);
+        acc22 = fma(a2.s0, b0.s2, acc22);
+        acc23 = fma(a2.s0, b0.s3, acc23);
+
+        acc20 = fma(a2.s1, b1.s0, acc20);
+        acc21 = fma(a2.s1, b1.s1, acc21);
+        acc22 = fma(a2.s1, b1.s2, acc22);
+        acc23 = fma(a2.s1, b1.s3, acc23);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc30 = fma(a3.s0, b0.s0, acc30);
+        acc31 = fma(a3.s0, b0.s1, acc31);
+        acc32 = fma(a3.s0, b0.s2, acc32);
+        acc33 = fma(a3.s0, b0.s3, acc33);
+
+        acc30 = fma(a3.s1, b1.s0, acc30);
+        acc31 = fma(a3.s1, b1.s1, acc31);
+        acc32 = fma(a3.s1, b1.s2, acc32);
+        acc33 = fma(a3.s1, b1.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+    {
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+        // Multiply and accumulate
+        acc00 = fma(a0, b0.s0, acc00);
+        acc01 = fma(a0, b0.s1, acc01);
+        acc02 = fma(a0, b0.s2, acc02);
+        acc03 = fma(a0, b0.s3, acc03);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc10 = fma(a1, b0.s0, acc10);
+        acc11 = fma(a1, b0.s1, acc11);
+        acc12 = fma(a1, b0.s2, acc12);
+        acc13 = fma(a1, b0.s3, acc13);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc20 = fma(a2, b0.s0, acc20);
+        acc21 = fma(a2, b0.s1, acc21);
+        acc22 = fma(a2, b0.s2, acc22);
+        acc23 = fma(a2, b0.s3, acc23);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc30 = fma(a3, b0.s0, acc30);
+        acc31 = fma(a3, b0.s1, acc31);
+        acc32 = fma(a3, b0.s2, acc32);
+        acc33 = fma(a3, b0.s3, acc33);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    acc00 = acc00 * ALPHA;
+    acc01 = acc01 * ALPHA;
+    acc02 = acc02 * ALPHA;
+    acc03 = acc03 * ALPHA;
+#endif // defined(ALPHA)
+
+    float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));
+    vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+    acc10 = acc10 * ALPHA;
+    acc11 = acc11 * ALPHA;
+    acc12 = acc12 * ALPHA;
+    acc13 = acc13 * ALPHA;
+#endif // defined(ALPHA)
+    float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));
+    vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+    acc20 = acc20 * ALPHA;
+    acc21 = acc21 * ALPHA;
+    acc22 = acc22 * ALPHA;
+    acc23 = acc23 * ALPHA;
+#endif // defined(ALPHA)
+    float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));
+    vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+    acc30 = acc30 * ALPHA;
+    acc31 = acc31 * ALPHA;
+    acc32 = acc32 * ALPHA;
+    acc33 = acc33 * ALPHA;
+#endif // defined(ALPHA)
+    float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));
+    vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
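The notes above list the compile-time defines this kernel expects (-DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_X/Y and the optional -DALPHA). As a minimal host-side sketch only — the helper name, the program/device handles and the choice of X=4/Y=4 to match the kernel's vstore4-based 4x4 tile are editorial assumptions, not the library's actual build path — the options string might be assembled like this:

#include <stdio.h>
#include <CL/cl.h>

/* Hypothetical helper: builds a program containing the kernel above with the
 * documented defines. ALPHA is added only when alpha != 1.0f, so the optional
 * #if defined(ALPHA) scaling paths are compiled out otherwise. */
static cl_int build_gemm_f32_bifrost(cl_program program, cl_device_id device,
                                     int cols_a, float alpha)
{
    char options[256];
    int  n = snprintf(options, sizeof(options),
                      "-DCOLS_A=%d -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4",
                      cols_a);
    if(alpha != 1.0f)
    {
        snprintf(options + n, sizeof(options) - (size_t)n, " -DALPHA=%ff", alpha);
    }
    return clBuildProgram(program, 1, &device, options, NULL, NULL);
}

When NUM_ELEMS_PROCESSED_PER_THREAD_Y is smaller than 4, the unused accumulator rows are simply compiled out by the #if guards in the kernel.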
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
+ * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+                                                      IMAGE_DECLARATION(src1),
+                                                      IMAGE_DECLARATION(dst))
+{
+    // Requires NUM_ELEMS_PROCESSED_PER_THREAD_X == 2: C is handled as vectors of 2 floats, A as vectors of 4 floats and B as two vload2 per iteration (to be revisited for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1)
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(float);
+
+    // Address boundary for the matrix A
+    int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));
+
+    // Initialize accumulators
+    float acc00 = 0.0f;
+    float acc01 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float acc10 = 0.0f;
+    float acc11 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float acc20 = 0.0f;
+    float acc21 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float acc30 = 0.0f;
+    float acc31 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    // A and B src indices get incremented at the same time.
+    for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))
+    {
+        // Load values from matrix A
+        float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+
+        // Load values from matrix B
+        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));
+        float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));
+        float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));
+        float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));
+
+        // Multiply and accumulate
+        acc00 = fma(a0.s0, b0.s0, acc00);
+        acc00 = fma(a0.s1, b1.s0, acc00);
+        acc00 = fma(a0.s2, b2.s0, acc00);
+        acc00 = fma(a0.s3, b3.s0, acc00);
+
+        acc01 = fma(a0.s0, b0.s1, acc01);
+        acc01 = fma(a0.s1, b1.s1, acc01);
+        acc01 = fma(a0.s2, b2.s1, acc01);
+        acc01 = fma(a0.s3, b3.s1, acc01);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+        acc10 = fma(a0.s0, b0.s0, acc10);
+        acc10 = fma(a0.s1, b1.s0, acc10);
+        acc10 = fma(a0.s2, b2.s0, acc10);
+        acc10 = fma(a0.s3, b3.s0, acc10);
+
+        acc11 = fma(a0.s0, b0.s1, acc11);
+        acc11 = fma(a0.s1, b1.s1, acc11);
+        acc11 = fma(a0.s2, b2.s1, acc11);
+        acc11 = fma(a0.s3, b3.s1, acc11);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+        acc20 = fma(a0.s0, b0.s0, acc20);
+        acc20 = fma(a0.s1, b1.s0, acc20);
+        acc20 = fma(a0.s2, b2.s0, acc20);
+        acc20 = fma(a0.s3, b3.s0, acc20);
+
+        acc21 = fma(a0.s0, b0.s1, acc21);
+        acc21 = fma(a0.s1, b1.s1, acc21);
+        acc21 = fma(a0.s2, b2.s1, acc21);
+        acc21 = fma(a0.s3, b3.s1, acc21);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        a0    = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+        acc30 = fma(a0.s0, b0.s0, acc30);
+        acc30 = fma(a0.s1, b1.s0, acc30);
+        acc30 = fma(a0.s2, b2.s0, acc30);
+        acc30 = fma(a0.s3, b3.s0, acc30);
+
+        acc31 = fma(a0.s0, b0.s1, acc31);
+        acc31 = fma(a0.s1, b1.s1, acc31);
+        acc31 = fma(a0.s2, b2.s1, acc31);
+        acc31 = fma(a0.s3, b3.s1, acc31);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+    // Leftover loop: advance matrix A by one float (4 bytes) and matrix B by one row per iteration
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))
+    {
+        // Load values from matrix A
+        float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+
+        // Multiply and accumulate
+        acc00 = fma(a0, b0.s0, acc00);
+        acc01 = fma(a0, b0.s1, acc01);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc10 = fma(a1, b0.s0, acc10);
+        acc11 = fma(a1, b0.s1, acc11);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc20 = fma(a2, b0.s0, acc20);
+        acc21 = fma(a2, b0.s1, acc21);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc30 = fma(a3, b0.s0, acc30);
+        acc31 = fma(a3, b0.s1, acc31);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    acc00 = acc00 * ALPHA;
+    acc01 = acc01 * ALPHA;
+#endif // defined(ALPHA)
+    float2 acc0 = ((float2)(acc00, acc01));
+    vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+    acc10 = acc10 * ALPHA;
+    acc11 = acc11 * ALPHA;
+#endif // defined(ALPHA)
+    float2 acc1 = ((float2)(acc10, acc11));
+    vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+    acc20 = acc20 * ALPHA;
+    acc21 = acc21 * ALPHA;
+#endif // defined(ALPHA)
+    float2 acc2 = ((float2)(acc20, acc21));
+    vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+    acc30 = acc30 * ALPHA;
+    acc31 = acc31 * ALPHA;
+#endif // defined(ALPHA)
+    float2 acc3 = (float2)(acc30, acc31);
+    vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
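For readability, here is a plain-C sketch (editorial, not part of the patch) of what one work-item of the two Bifrost kernels above computes: a tile_h x tile_w block of dst = alpha * A * B, with A stored row-major as M x K and B as K x N, and the optional alpha scaling applied at the end. Each acc<row><col> variable in the kernels plays the role of acc for one (y, x) position of the tile.

/* Scalar reference for one work-item's output tile (assumes row-major storage, no padding). */
static void gemm_tile_reference(const float *a, const float *b, float *dst,
                                int n, int k, float alpha,
                                int tile_row, int tile_col, int tile_h, int tile_w)
{
    for(int y = 0; y < tile_h; ++y)
    {
        for(int x = 0; x < tile_w; ++x)
        {
            float acc = 0.0f; /* corresponds to acc<y><x> in the kernels */
            for(int i = 0; i < k; ++i)
            {
                acc += a[(tile_row + y) * k + i] * b[i * n + (tile_col + x)];
            }
            dst[(tile_row + y) * n + (tile_col + x)] = alpha * acc;
        }
    }
}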
+
+#if defined(FIXED_POINT_POSITION)
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
  *
  * @note This OpenCL kernel works with fixed point data types QS8
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
- * @note The width of matrix A, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
+ * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
- * @note The alpha value must be passed in 8 bit fixed point format using -DALPHA
+ * @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1269,21 +1562,29 @@
     // Multiply by the weight of matrix product and store the result
     char16 acc_qs8;
     acc_qs8 = convert_char16_sat((short16)(acc00, acc01));
+#if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     acc_qs8 = convert_char16_sat((short16)(acc10, acc11));
+#if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     acc_qs8 = convert_char16_sat((short16)(acc20, acc21));
+#if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     acc_qs8 = convert_char16_sat((short16)(acc30, acc31));
+#if defined(ALPHA)
     acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
@@ -1292,9 +1593,9 @@
  *
  * @note This OpenCL kernel works with fixed point data types QS16
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
- * @note The width of matrix A, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
+ * @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA
  * @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION
- * @note The alpha value must be passed in 16 bit fixed point format using -DALPHA
+ * @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA
  *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: QS8/QS16
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
@@ -1344,7 +1645,7 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 
     // This for loop performs 4 accumulations per iteration
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))
     {
         short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -1408,29 +1709,36 @@
     // Multiply by the weight of matrix product and store the result
     short8 acc_qs16;
     acc_qs16 = convert_short8_sat(acc0);
+#if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     acc_qs16 = convert_short8_sat(acc1);
+#if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     acc_qs16 = convert_short8_sat(acc2);
+#if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     acc_qs16 = convert_short8_sat(acc3);
+#if defined(ALPHA)
     acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);
+#endif // defined(ALPHA)
     vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 }
 #endif // defined(FIXED_POINT_POSITION)
 #endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
-#endif // defined(COLS_B) && defined(ALPHA)
 
-#ifdef BETA
+#if defined(BETA)
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
  *
  * @attention The beta's value need to be passed at compile time using -DBETA
@@ -1451,20 +1759,20 @@
 __kernel void gemm_ma_f32(IMAGE_DECLARATION(src),
                           IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from A x B */
+    // Load values from A x B
     float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
 
-    /* Load values from Matrix C */
+    // Load values from Matrix C
     float4 c = vload4(0, (__global float *)src.ptr);
 
-    /* Computes alpha * axb + beta * c */
+    // Computes alpha * axb + beta * c
     float4 out = alpha_ab + (float4)BETA * c;
 
-    /* Store final result in axb matrix */
+    // Store final result in axb matrix
     vstore4(out, 0, (__global float *)dst.ptr);
 }
 
@@ -1488,24 +1796,24 @@
 __kernel void gemm_ma_f16(IMAGE_DECLARATION(src),
                           IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from A x B */
+    // Load values from A x B
     half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
 
-    /* Load values from Matrix C */
+    // Load values from Matrix C
     half8 c = vload8(0, (__global half *)src.ptr);
 
-    /* Computes alpha * axb + beta * c */
+    // Computes alpha * axb + beta * c
     half8 out = alpha_ab + (half8)BETA * c;
 
-    /* Store final result in axb matrix */
+    // Store final result in axb matrix
     vstore8(out, 0, (__global half *)dst.ptr);
 }
 
-#ifdef FIXED_POINT_POSITION
+#if defined(FIXED_POINT_POSITION)
 /** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:
  *
  * @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION
@@ -1528,20 +1836,20 @@
 __kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),
                           IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from A x B */
+    // Load values from A x B
     char16 alpha_ab = vload16(0, (__global char *)dst.ptr);
 
-    /* Load values from Matrix C */
+    // Load values from Matrix C
     char16 c = vload16(0, (__global char *)src.ptr);
 
-    /* Computes alpha * axb + beta * c */
+    // Computes alpha * axb + beta * c
     char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);
 
-    /* Store final result in axb matrix */
+    // Store final result in axb matrix
     vstore16(out, 0, (__global char *)dst.ptr);
 }
 
@@ -1567,26 +1875,26 @@
 __kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),
                            IMAGE_DECLARATION(dst))
 {
-    /* Compute source and destination addresses */
+    // Compute source and destination addresses
     Image src = CONVERT_TO_IMAGE_STRUCT(src);
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
-    /* Load values from A x B */
+    // Load values from A x B
     short8 alpha_ab = vload8(0, (__global short *)dst.ptr);
 
-    /* Load values from Matrix C */
+    // Load values from Matrix C
     short8 c = vload8(0, (__global short *)src.ptr);
 
-    /* Computes alpha * axb + beta * c */
+    // Computes alpha * axb + beta * c
     short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);
 
-    /* Store final result in axb matrix */
+    // Store final result in axb matrix
     vstore8(out, 0, (__global short *)dst.ptr);
 }
-#endif /* defined(FIXED_POINT_POSITION) */
-#endif /* defined(BETA) */
+#endif // defined(FIXED_POINT_POSITION)
+#endif // defined(BETA)
 
-#ifdef WIDTH_VECTOR_A
+#if defined(WIDTH_VECTOR_A)
 /** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer
  *
  * @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A
@@ -1621,7 +1929,7 @@
     int idx = get_global_id(0) * 4;
     int idy = get_global_id(1);
 
-    /* Compute the address for the vector A and matrix B */
+    // Compute the address for the vector A and matrix B
     int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));
     src_addr.s1 += idx * sizeof(float);
 
@@ -1629,7 +1937,7 @@
 
     float4 acc = 0.0f;
 
-    for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+    for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
     {
         float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
         float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
@@ -1647,9 +1955,49 @@
         acc += b0 * (float4)a0;
     }
 
-    /* Compute destination address */
+    // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
     vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
 }
-#endif /* WIDTH_VECTOR_A */
\ No newline at end of file
+#endif // defined(WIDTH_VECTOR_A)
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: U8/S8/QS8/U16/S16/F16/U32/S32/F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data types: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
+__kernel void gemm_accumulate_biases(
+    IMAGE_DECLARATION(accum),
+    VECTOR_DECLARATION(biases))
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    // Vector size, i.e. number of vector elements.
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+#ifdef FIXED_POINT_POSITION
+    accum_value = ADD_SAT_OP_EXPAND(biases_value, accum_value, DATA_TYPE, VECTOR_SIZE);
+#else  // FIXED_POINT_POSITION
+    accum_value = biases_value + accum_value;
+#endif // FIXED_POINT_POSITION
+    // Store result in the accumulate buffer
+    VSTORE(VECTOR_SIZE)
+    (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
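As a plain-C illustration of the bias accumulation above (editorial sketch, assuming DATA_TYPE=float and no fixed point; the function name and the row-major w x h layout are placeholders), every row of the accumulate tensor receives the same biases vector:

static void gemm_accumulate_biases_reference(float *accum, const float *biases, int w, int h)
{
    for(int y = 0; y < h; ++y)
    {
        for(int x = 0; x < w; ++x)
        {
            accum[y * w + x] += biases[x]; /* one bias per output column, broadcast over all rows */
        }
    }
}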
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
new file mode 100644
index 0000000..a928813
--- /dev/null
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+#if defined(COLS_B)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication
+ *
+ * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+                                                 IMAGE_DECLARATION(dst))
+{
+    // src_addr.s0 = address of matrix A
+    // src_addr.s1 = address of matrix B
+    // Compute address for matrix A and B
+    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
+                                                                        (src1_stride_y));
+
+    // Add offset_first_element_in_bytes
+    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Compute end row address for matrix B
+    int end_row_mtx_b = src_addr.s1 + COLS_B;
+
+    // Reset accumulators
+    int16 c00 = 0;
+    int16 c10 = 0;
+    int16 c20 = 0;
+    int16 c30 = 0;
+
+    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        int8 a0  = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+        c00 += (int16)a0.s0 * b0;
+        c10 += (int16)a0.s1 * b0;
+        c20 += (int16)a0.s2 * b0;
+        c30 += (int16)a0.s3 * b0;
+
+        int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));
+
+        c00 += (int16)a0.s4 * b1;
+        c10 += (int16)a0.s5 * b1;
+        c20 += (int16)a0.s6 * b1;
+        c30 += (int16)a0.s7 * b1;
+    }
+
+    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        int4 a0  = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
+        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));
+
+        c00 += (int16)a0.s0 * b0;
+        c10 += (int16)a0.s1 * b0;
+        c20 += (int16)a0.s2 * b0;
+        c30 += (int16)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store 4x16 block
+    vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
+    vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
+    vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
+    vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
+}
+#endif // defined(COLS_B)
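Ignoring the interleaved/transposed storage, the 4x16 block produced above is the plain u8 x u8 -> s32 product sketched below (editorial reference only; A is assumed M x K and B is K x N, both row-major QASYMM8, with the quantization offsets handled later by the reduction and offset-contribution kernels):

static void gemmlowp_block_reference(const unsigned char *a, const unsigned char *b, int *dst,
                                     int n, int k, int block_row, int block_col)
{
    for(int r = 0; r < 4; ++r)      /* rows of the block: c00, c10, c20, c30 */
    {
        for(int c = 0; c < 16; ++c) /* the 16 lanes of each int16 accumulator */
        {
            int acc = 0;
            for(int i = 0; i < k; ++i)
            {
                acc += (int)a[(block_row + r) * k + i] * (int)b[i * n + (block_col + c)];
            }
            dst[(block_row + r) * n + (block_col + c)] = acc;
        }
    }
}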
+
+#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+__kernel void gemmlowp_mm(IMAGE_DECLARATION(src0),
+                          IMAGE_DECLARATION(src1),
+                          IMAGE_DECLARATION(dst))
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx;
+
+    int end_row_vec_a = src_addr.s0 + COLS_A;
+
+    VECTOR_UINT acc0 = 0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VECTOR_UINT acc1 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VECTOR_UINT acc2 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VECTOR_UINT acc3 = 0;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
+    {
+        // Load values from matrix A
+        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
+        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);
+
+        // Accumulate
+        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
+        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
+        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
+        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
+        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+    {
+        // Load values from matrix A
+        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        // Load values from matrix B
+        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
+
+        // Accumulate
+        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Store the result
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the vector of sums of all the entries in each row of Matrix A (one sum per row).
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uint4 sum_row_u32 = (uint4)0;
+    uint  sum_row     = 0;
+
+    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 16 accumulations
+    for(; i <= ((int)COLS_A - 16); i += 16)
+    {
+        const uchar16 a0_u8 = vload16(0, matrix_a + i);
+
+        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += matrix_a[i];
+    }
+
+    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;
+
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(COLS_A)
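A minimal plain-C equivalent of the row-sum reduction above (editorial sketch; rows and cols_a describe a row-major QASYMM8 matrix A, producing one int32 sum per row):

static void gemmlowp_matrix_a_reduction_reference(const unsigned char *a, int *sum_row,
                                                  int rows, int cols_a)
{
    for(int r = 0; r < rows; ++r)
    {
        int sum = 0;
        for(int i = 0; i < cols_a; ++i)
        {
            sum += a[r * cols_a + i]; /* accumulate the unsigned 8-bit entries of row r */
        }
        sum_row[r] = sum;
    }
}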
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the vector of sums of all the entries in each column of Matrix B (one sum per column).
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uint16 sum_col_u32 = (uint16)0;
+
+    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+    // This for loop performs 4 accumulations
+    for(; i <= ((int)ROWS_B - 4); i += 4)
+    {
+        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
+        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
+        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
+        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);
+
+        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);
+
+        matrix_b += 4 * src_stride_y;
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < (int)ROWS_B; ++i)
+    {
+        const uchar16 b0_u8 = vload16(0, matrix_b);
+
+        sum_col_u32 += convert_uint16(b0_u8);
+
+        matrix_b += src_stride_y;
+    }
+
+    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
+}
+#endif // defined(COLS_B) && defined(ROWS_B)
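Likewise, a minimal plain-C equivalent of the column-sum reduction above (editorial sketch; rows_b and cols_b describe a row-major QASYMM8 matrix B, producing one int32 sum per column):

static void gemmlowp_matrix_b_reduction_reference(const unsigned char *b, int *sum_col,
                                                  int rows_b, int cols_b)
{
    for(int c = 0; c < cols_b; ++c)
    {
        int sum = 0;
        for(int r = 0; r < rows_b; ++r)
        {
            sum += b[r * cols_b + c]; /* accumulate the unsigned 8-bit entries of column c */
        }
        sum_col[c] = sum;
    }
}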
+
+#if defined(K_OFFSET)
+/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, when gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr                                Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x                           Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x                             mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y                           Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y                             mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z                             mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes      The offset of the first element in the source tensor
+ * @param[in] sum_col_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_result_step_x                        sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_result_step_y                        sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_row_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_result_step_x                        sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_result_step_y                        sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                          )
+{
+    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
+
+    int16 a_offset_s32 = (int16)0;
+    int16 b_offset_s32 = (int16)0;
+
+#if defined(A_OFFSET)
+    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);
+
+    // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
+#else  // defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr));
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+    a_offset_s32 *= (int16)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);
+
+    // Compute the offset contribution due to B_OFFSET
+    b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
+    b_offset_s32 *= (int16)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+    const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;
+
+    int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // Store the result with the offset contribution
+    vstore16(in_s32, 0, (__global int *)mm_result.ptr);
+}
+#endif // defined(K_OFFSET)
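The formula in the comment above follows from expanding the offset-corrected product: sum over k of (A[i][k] + a_offset) * (B[k][j] + b_offset) = mm_result[i][j] + a_offset * sum_col[j] + b_offset * sum_row[i] + a_offset * b_offset * K, where sum_col holds the column sums of B and sum_row the row sums of A produced by the reduction kernels above. A one-line scalar check (editorial sketch; the function name and argument order are placeholders):

static int gemmlowp_offset_contribution_reference(int mm_result, int sum_row_i, int sum_col_j,
                                                  int a_offset, int b_offset, int k)
{
    /* A_OFFSET, B_OFFSET and K_OFFSET in the kernel correspond to
     * a_offset, b_offset and a_offset * b_offset * k respectively. */
    return mm_result + a_offset * sum_col_j + b_offset * sum_row_i + a_offset * b_offset * k;
}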
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time).
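+ *
+ * A worked example (illustrative values, not taken from the library): with RESULT_OFFSET=3, RESULT_MULT_INT=2,
+ * RESULT_SHIFT=1, no bias and no bounds, an accumulator value of 10 becomes ((10 + 3) * 2) >> 1 = 13 before the
+ * final saturation to [0..255] and the cast to QASYMM8.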
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                  VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                  TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#if defined(ADD_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+#endif // defined(ADD_BIAS)
+
+    int16 input_values = vload16(0, (__global int *)src.ptr);
+
+    // Add the offset terms to GEMM's result
+    input_values += (int16)RESULT_OFFSET;
+
+#if defined(ADD_BIAS)
+    // Add bias
+    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
+    input_values += (int16)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+    input_values *= RESULT_MULT_INT;
+
+    input_values >>= RESULT_SHIFT;
+
+    uchar16 res = convert_uchar16_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore16(res, 0, dst.ptr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Compute fixed point multiplication between each entry of input and result_fixedpoint_multiplier
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8
+ *  -# Clamp the value between the specified min and max bounds.
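+ *
+ * A worked example (illustrative values, not taken from the library): with result_fixedpoint_multiplier = 1 << 30
+ * (roughly 0.5 in Q0.31), result_shift = 1, no bias and RESULT_OFFSET_AFTER_SHIFT = 0, an accumulator value of 8 is
+ * scaled to about 8 * 0.5 = 4 and then rounding-divided by 2, giving 2 before the saturation to [0..255].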
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#if defined(ADD_BIAS)
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+#endif // defined(ADD_BIAS)
+
+    int16 input_values = vload16(0, (__global int *)src.ptr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
+    input_values += (int16)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_fixedpoint_multiplier and shift by result_shift
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
+
+    // Add the offset terms to GEMM's result
+    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
+
+    uchar16 res = convert_uchar16_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar16)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar16)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore16(res, 0, dst.ptr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
index 76128f7..3e38c73 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -35,7 +35,7 @@
  * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Same as @p src_ptr
  * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
  * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 4421e74..768f7ee 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -24,7 +24,10 @@
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H
 
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
 #if defined(ARM_COMPUTE_DEBUG_ENABLED)
 #pragma OPENCL EXTENSION cl_arm_printf : enable
 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
@@ -42,6 +45,9 @@
 #define VEC_DATA_TYPE_STR(type, size) type##size
 #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
 
+#define CL_VEC_DATA_TYPE_STR(type, size) type##size
+#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size)
+
 #define CONVERT_STR(x, type) (convert_##type((x)))
 #define CONVERT(x, type) CONVERT_STR(x, type)
 
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
new file mode 100644
index 0000000..b44d0f1
--- /dev/null
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
+#define ARM_COMPUTE_HELPERS_ASYMM_H
+
+#include "helpers.h"
+
+/** Correctly-rounded-to-nearest division by a power-of-two.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Correctly-rounded-to-nearest division by a power-of-two.
+ */
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                   \
+    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                                              \
+        VEC_DATA_TYPE(int, size)                                                                                   \
+        mask = (1 << exponent) - 1;                                                                                \
+        const VEC_DATA_TYPE(int, size) zero = 0;                                                                   \
+        const VEC_DATA_TYPE(int, size) one  = 1;                                                                   \
+        VEC_DATA_TYPE(int, size)                                                                                   \
+        threshold = (mask >> 1) + select(zero, one, x < 0);                                                        \
+        return (x >> exponent) + select(zero, one, (x & mask) > threshold);                                        \
+    }
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
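+
+// Usage sketch (illustrative, not part of the original header): for a 16-wide int vector x,
+// ASYMM_ROUNDING_DIVIDE_BY_POW2(x, 1, 16) halves each lane with round-to-nearest, e.g. a lane holding 5 becomes 3.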
+
+/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
+ * rounding to the nearest value, and saturating -1 * -1 to the maximum value.
+ *
+ * @param[in] size Size of vector.
+ *
+ * @return Product of two fixed-point numbers.
+ */
+#define ASYMM_MULT_IMP(size)                                                                                 \
+    inline VEC_DATA_TYPE(int, size) asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        overflow = a == b && a == INT_MIN;                                                                   \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        a_64 = convert_long##size(a);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        b_64 = convert_long##size(b);                                                                        \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        ab_64 = a_64 * b_64;                                                                                 \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        mask1 = 1 << 30;                                                                                     \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        mask2 = 1 - (1 << 30);                                                                               \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        nudge = select(mask2, mask1, ab_64 >= 0);                                                            \
+        VEC_DATA_TYPE(long, size)                                                                            \
+        mask = 1ll << 31;                                                                                    \
+        VEC_DATA_TYPE(int, size)                                                                             \
+        ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask);                                            \
+        return select(ab_x2_high32, INT_MAX, overflow);                                                      \
+    }
+
+ASYMM_MULT_IMP(2)
+ASYMM_MULT_IMP(8)
+ASYMM_MULT_IMP(16)
+
+#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
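+
+// Usage sketch (illustrative): ASYMM_MULT((int16)(1 << 30), (int16)(1 << 30), 16) yields (int16)(1 << 29),
+// i.e. 0.5 * 0.5 = 0.25 when the operands are read as Q0.31 fixed-point values.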
+
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
+    ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
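+// Usage sketch (illustrative): ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, 1 << 30, 1, 16) scales each lane
+// of x by roughly 0.25 (multiply by ~0.5 in fixed point, then rounding-divide by 2).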
+
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index e9845e0..e9b5e97 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -74,7 +74,7 @@
     float16 arct = atan2pi(convert_float16(b), convert_float16(a));
     arct         = select(arct, arct + 2, arct < 0.0f);
 
-    return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & 0xFFu);
+    return convert_uchar16(convert_int16(mad(arct, 128, 0.5f)) & (int16)0xFFu);
 }
 
 #if(1 == MAGNITUDE)
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 4e65560..bc00252 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -50,7 +50,7 @@
 
 #endif // FIXED_POINT_POSITION
 
-/** Apply cross map normalization.
+/** Apply cross-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -92,14 +92,13 @@
     kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
 
     const int current_slice = get_global_id(2);
-
-    const int left_slice  = max(current_slice - (int)RADIUS, (int)0);
-    const int right_slice = min(current_slice + (int)RADIUS, (int)(NUM_SLICES - 1));
+    const int left_slice    = max(-(int)RADIUS, -current_slice);
+    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
 
     for(int i = left_slice; i <= right_slice; i++)
     {
         VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i - current_slice));
+        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
         acc    = ADD_OP(acc, MUL_OP(values, values));
     }
 
@@ -112,7 +111,7 @@
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
 
-/** Apply in map normalization.
+/** Apply in-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
@@ -137,8 +136,8 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void normalization_layer_in_map_1D(TENSOR3D_DECLARATION(input),
-                                            TENSOR3D_DECLARATION(output))
+__kernel void normalization_layer_in_map(TENSOR3D_DECLARATION(input),
+                                         TENSOR3D_DECLARATION(output))
 {
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
@@ -152,17 +151,34 @@
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
 
-    const int current_pos = get_global_id(0) << 2;
+    const int current_col = get_global_id(0) << 2;
+    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
+    const int right_pos   = min((int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1 - current_col));
 
-    const int left_pos  = max(current_pos - (int)RADIUS, -3);
-    const int right_pos = min(current_pos + (int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1));
+#if defined(IN_MAP_2D)
+    const int current_row = get_global_id(1);
+    const int first_row   = max(-(int)RADIUS, -current_row);
+    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
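+    // Note (illustrative): the loop bounds are offsets relative to the current element, so with e.g. RADIUS=2 and
+    // current_row=0 the row window becomes [0, 2] and no out-of-bounds rows are read.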
 
-    for(int i = left_pos; i <= right_pos; i += 1)
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
     {
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i - current_pos, 0, 0));
-        acc    = ADD_OP(acc, MUL_OP(values, values));
+#endif /* defined(IN_MAP_2D) */
+        for(int i = left_pos; i <= right_pos; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = LOAD_OP(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
     }
+#endif /* defined(IN_MAP_2D) */
 
     acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
index e1131d5..507e85c 100644
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -29,9 +29,6 @@
  * - Determinant less than DETERMINANT_THR
  * - or minimum eigenvalue is smaller then EIGENVALUE_THR
  *
- * The thresholds for the determinant and the minimum eigenvalue is
- * defined by the OpenVX spec
- *
  * Note: Also lost tracking happens when the point tracked coordinate is outside
  * the image coordinates
  *
@@ -268,7 +265,7 @@
 
     float4 w;
     w    = round(w_scharr * (float4)D0);
-    w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
+    w.s3 = D0 - w.s0 - w.s1 - w.s2;
 
     // G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
     int4 iG = (int4)0;
@@ -309,7 +306,7 @@
             // Compute bilinear interpolation for iyval
             old_i.s2 = dot(px, w_scharr);
 
-            // Rounding (it could be omitted. Used just for matching the VX implementation)
+            // Rounding (it could be omitted)
             int4 iold = convert_int4(round(old_i));
 
             // Accumulate values in the Spatial Gradient Matrix
@@ -352,8 +349,8 @@
  * @param[in]      border_limits                           It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
  * @param[in]      eig_const                               1.0f / (float)(2.0f * window_dimension * window_dimension)
  * @param[in]      level0                                  It is set to 1 if level of pyramid = 0
- * @param[in]      term_iteration                          It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
- * @param[in]      term_epsilon                            It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
+ * @param[in]      term_iteration                          It is set to 1 if termination = TERM_CRITERIA_ITERATIONS
+ * @param[in]      term_epsilon                            It is set to 1 if termination = TERM_CRITERIA_EPSILON
  */
 void __kernel lktracker_stage1(
     IMAGE_DECLARATION(new_image),
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 99d7e6e..ee8ff27 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -186,10 +186,14 @@
 DATA_TYPE calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
-    const int start_x = get_global_id(0) * stride_x - pad_x;
-    const int start_y = get_global_id(1) * stride_y - pad_y;
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
     const int end_x   = min(start_x + pool_size, upper_bound_w);
     const int end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
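+    // With EXCLUDE_PADDING defined the window start is clamped to the tensor, so (a sketch of the intended
+    // semantics) left/top padding no longer counts toward the averaging area returned below.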
     return ((end_y - start_y) * (end_x - start_x));
 }
 
@@ -334,10 +338,14 @@
 calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
                      const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
-    const int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
-    const int  start_y = get_global_id(1) * stride_y - pad_y;
+    int4       start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+    int        start_y = get_global_id(1) * stride_y - pad_y;
     const int4 end_x   = min(start_x + (int4)pool_size, (int4)upper_bound_w);
     const int  end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max((int4)0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
     return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
 }
 
@@ -367,7 +375,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void pooling_layer_3_optimized(
+__kernel void pooling_layer_optimized_3(
     TENSOR3D_DECLARATION(input),
     TENSOR3D_DECLARATION(output))
 {
@@ -395,103 +403,6 @@
 }
 #endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
 
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QS8/QS16/F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_7(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
-
-#if defined(POOL_L2)
-    // Raise to power of 2 for L2 Pooling
-    data0 = POW2_OP(data0, 8);
-    data1 = POW2_OP(data1, 8);
-    data2 = POW2_OP(data2, 8);
-    data3 = POW2_OP(data3, 8);
-    data4 = POW2_OP(data4, 8);
-    data5 = POW2_OP(data5, 8);
-    data6 = POW2_OP(data6, 8);
-#endif /* defined(POOL_L2) */
-
-    // Pool operation of all rows
-    data0 = POOL_OP(data0, data1);
-    data2 = POOL_OP(data2, data3);
-    data4 = POOL_OP(data4, data5);
-    data0 = POOL_OP(data0, data2);
-    data4 = POOL_OP(data4, data6);
-    data0 = POOL_OP(data0, data4);
-
-    // Set last element
-#if defined(POOL_AVG) || defined(POOL_L2)
-    data0.s7 = 0;
-#else  /* defined(POOL_AVG) || defined(POOL_L2) */
-    data0.s7 = data0.s6;
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-    // Reduce result
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    reduce4 = POOL_OP(data0.s0123, data0.s4567);
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    reduce2       = POOL_OP(reduce4.s01, reduce4.s23);
-    DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    // Store result
-    *(__global DATA_TYPE *)output.ptr = res;
-}
-
 #if defined(POOL_SIZE)
 
 // Set the initial value for the pooling operation accordingly with the data type
@@ -502,7 +413,6 @@
 #define MIN_VAL_EXPAND(type) type##_MIN
 #define MIN_VAL(type) MIN_VAL_EXPAND(type)
 #define INITIAL_VALUE MIN_VAL(DATA_TYPE)
-#define INITIAL_VALUE 0
 #else // FIXED_POINT_POSITION
 #if FP16
 #define INITIAL_VALUE -HALF_MAX
@@ -515,7 +425,7 @@
 
 /** Performs a pooling function of pool size equal to N
  *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
  * @note -DFP16 must be passed at compile time if half float data type is used
  * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
  * @note In case of average pooling the following information must be passed at compile time:
@@ -524,7 +434,7 @@
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QS8/QS16/F16/F32
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
@@ -601,4 +511,4 @@
     // Store result
     *(__global DATA_TYPE *)output.ptr = res;
 }
-#endif // defined(POOL_SIZE)
\ No newline at end of file
+#endif // defined(POOL_SIZE)
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
new file mode 100644
index 0000000..39c2c22
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size, upper_bound_w);
+    const int end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
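+
+// Worked example (illustrative): with POOL_SIZE=3, STRIDE_X=STRIDE_Y=2, PAD_X=PAD_Y=1 and EXCLUDE_PADDING defined,
+// the top-left output element averages over a 2x2 region instead of the padded 3x3 window.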
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N_quantized(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int8 vdata = 0;
+    int  sdata = 0;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE - 8); x += 8)
+        {
+            uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
+            int8 data0  = convert_int8(data);
+            vdata       = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE; ++x)
+        {
+            uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
+            int data0  = convert_int(data);
+            sdata      = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+    int  res     = POOL_OP(reduce2.s0, reduce2.s1);
+    res          = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+#endif /* defined(POOL_AVG) */
+
+    // Store result
+    *(__global uchar *)output.ptr = convert_uchar(res);
+}
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 0106ce0..a2ae8c4 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -49,13 +49,23 @@
 inline const float8 transform_bilinear(const float2 coord, const float2 scale)
 {
     const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
-    const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
     return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
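+    // Note (illustrative): with scale.s0 = 0.5 and coord.s0 = 4, TOP_LEFT maps to x = 2.0 while CENTER maps to
+    // (4 + 0.5) * 0.5 - 0.5 = 1.75.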
 }
 
 /** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
  *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -84,12 +94,14 @@
     Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
     Image        out = CONVERT_TO_IMAGE_STRUCT(out);
     const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = clamp_to_border(transform_nearest(get_current_coords(), r), input_width, input_height);
+    const float8 tc  = clamp_to_border_with_size(transform_nearest(get_current_coords(), r), input_width, input_height, BORDER_SIZE);
     vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
 }
 
 /** Performs an affine transformation on an image interpolating with the BILINEAR method.
  *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
  * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
  * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -119,5 +131,5 @@
     Image        out = CONVERT_TO_IMAGE_STRUCT(out);
     const float2 r   = (float2)(scale_x, scale_y);
     const float8 tc  = transform_bilinear(get_current_coords(), r);
-    vstore4(bilinear_interpolate(&in, tc, input_width, input_height), 0, (__global DATA_TYPE *)out.ptr);
+    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
 }
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 9b24380..5d8cd12 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -29,6 +29,7 @@
 #define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
 #define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
 #define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
+#define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
 #define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
 #define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
 
@@ -42,6 +43,7 @@
 #define MAX_OP(x, y, type, size) max((x), (y))
 #define ADD_OP(x, y, type, size) ((x) + (y))
 #define SUB_OP(x, y, type, size) ((x) - (y))
+#define MUL_OP(x, y, type, size) ((x) * (y))
 #define DIV_OP(x, y, type, size) ((x) / (y))
 #define EXP_OP(x, type, size) exp((x))
 
@@ -55,8 +57,35 @@
 
 #endif /* FIXED_POINT_POSITION */
 
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+/* Vector size, i.e. number of vector elements. */
+#if VECTOR_SIZE == 2
+__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
+__constant uint2 idx__ = (uint2)(0, 1);
+
+#elif VECTOR_SIZE == 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
+
+#elif VECTOR_SIZE == 8
+__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
+__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+#else /* VECTOR_SIZE DEFAULT */
+#define VECTOR_SIZE 16
+#define LOG_VECTOR_SIZE 4
+__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
+__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+#endif /* VECTOR_SIZE END */
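+
+/* Note (assumption): for the non-default sizes LOG_VECTOR_SIZE is expected to be supplied as a build option
+ * (e.g. -DLOG_VECTOR_SIZE=3 together with -DVECTOR_SIZE=8); only the default 16-wide path defines it here. */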
+
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+__constant uint4 idx4   = (uint4)(0, 1, 2, 3);
 
 /** Identifies the maximum value across the 1st dimension.
  *
@@ -128,6 +157,7 @@
  * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
  * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (if undefined, assume it equals 1.0)
  *
  * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -175,6 +205,12 @@
     Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
     Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
 
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 16))BETA;
+#endif /* BETA */
+
     // Load max value of 1D logits vector (row)
     DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
 
@@ -189,6 +225,9 @@
         VEC_DATA_TYPE(DATA_TYPE, 16)
         data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
         data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
         data = EXP_OP(data, DATA_TYPE, 16);
         vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
@@ -199,6 +238,9 @@
     VEC_DATA_TYPE(DATA_TYPE, 16)
     data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
     data = SUB_OP(data, max_val, DATA_TYPE, 16);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, 16);
+#endif /* BETA */
     data = EXP_OP(data, DATA_TYPE, 16);
     VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
     widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
@@ -262,3 +304,460 @@
     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 }
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then computes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
+#endif /* BETA */
+
+    // Initialize local maximum
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
+
+    // Calculate max of row
+    const uint width_ = width >> LOG_VECTOR_SIZE;
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data_max    = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
+    widx        = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
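+    // Note: widx masks out the lanes that fall beyond 'width', so padding values loaded by the tail VLOAD cannot
+    // affect the row maximum.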
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform max reduction
+#if VECTOR_SIZE == 16
+    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+    // Store result
+    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
+
+    /* Second section */
+
+    // Load max value of 1D logits vector (row)
+    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    sum1D = 0;
+
+    // Shift values, exp and sum
+    for(uint i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+        VSTORE(VECTOR_SIZE)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+    }
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
+    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
+#ifdef BETA
+    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
+#endif /* BETA */
+    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
+    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
+    data = select(0, data, widx);
+    VSTORE(VECTOR_SIZE)
+    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
+    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Perform sum reduction
+#if VECTOR_SIZE == 16
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
+#endif /* VECTOR SIZE 16 END */
+#if VECTOR_SIZE >= 8
+    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
+#endif /* VECTOR SIZE 8 END */
+#if VECTOR_SIZE >= 4
+    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+#endif /* VECTOR SIZE 4 END */
+    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+
+    // Calculate and store result
+    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+}
+
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ * @param[in]  width                              Input image width
+ */
+__kernel void softmax_layer_max_shift_exp_sum_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    const uint lid = get_local_id(0);
+
+#ifdef BETA
+    // Initialize beta
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
+#endif /* BETA */
+
+    // Define one temporary vector per work-item.
+    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
+    // Number of elements per work-item.
+    const uint row = width / GRID_SIZE;
+    // Number of iterations per work-item.
+    const uint width_ = row >> 2;
+    // Calculate max of row
+    uint i = 0;
+    for(; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    // Number of work-items needed to complete the computation.
+    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max    = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle non multiple of 4
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx        = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = max_val_vec;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        max_val_vec     = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
+        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
+        max_local       = max_val_vec.s0;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* Second section */
+
+    // Set sum vector
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    sum1D             = 0;
+    DATA_TYPE max_val = max_local;
+
+    // Shift values, exp and sum
+    for(i = 0; i < width_; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_GRID_SIZE
+    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
+    if(lid < boundary_workitems)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    if(boundary_workitems == 0)
+    {
+        boundary_workitems = GRID_SIZE;
+        i--;
+    }
+    if(lid == (boundary_workitems - 1))
+    {
+        // Handle the case where width is not a multiple of the vector size: the offset (GRID_SIZE * i * 4) + 4 moves four element positions ahead (the factor 4 comes from the 4-element vectors)
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
+        data = SUB_OP(data, max_val, DATA_TYPE, 4);
+#ifdef BETA
+        data = MUL_OP(data, beta, DATA_TYPE, 4);
+#endif /* BETA */
+        data = EXP_OP(data, DATA_TYPE, 4);
+        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
+        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
+        data = select(0, data, widx);
+        VSTORE(4)
+        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
+        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
+    }
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+#endif /* NON_MULTIPLE_OF_GRID_SIZE */
+    tmp_local[lid] = sum1D;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(GRID_SIZE >= 256)
+    {
+        if(lid < 128)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 128)
+    {
+        if(lid < 64)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 64)
+    {
+        if(lid < 32)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 32)
+    {
+        if(lid < 16)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 16)
+    {
+        if(lid < 8)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 8)
+    {
+        if(lid < 4)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(GRID_SIZE >= 4)
+    {
+        if(lid < 2)
+        {
+            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if(lid == 0)
+    {
+        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
+        // Perform sum reduction
+        sum1D.s01                        = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
+        sum1D.s0                         = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
+        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
+    }
+}
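The two kernels above implement the fused first pass of the softmax layer: a row-wise maximum, a shift by that maximum, exponentiation (optionally scaled by beta) and a running sum, with the parallel variant splitting each row across GRID_SIZE work-items and combining partial results through a local-memory tree reduction. For reference, a scalar C++ sketch of what either kernel computes for one row (illustration only, not part of this patch; names are chosen for the example):

#include <algorithm>
#include <cmath>
#include <cstddef>

// Reference for one row: dst[i] = exp(beta * (src[i] - max(src))), plus the row max and sum.
void softmax_max_shift_exp_sum_ref(const float *src, float *dst, float *max_out, float *sum_out,
                                   std::size_t width, float beta)
{
    float max_val = src[0];
    for(std::size_t i = 1; i < width; ++i)
    {
        max_val = std::max(max_val, src[i]);
    }
    float sum = 0.0f;
    for(std::size_t i = 0; i < width; ++i)
    {
        const float e = std::exp(beta * (src[i] - max_val));
        dst[i] = e;
        sum += e;
    }
    *max_out = max_val;
    *sum_out = sum;
}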
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
new file mode 100644
index 0000000..31f402f
--- /dev/null
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "asymm_helper.h"
+#include "helpers.h"
+
+#define MAX_OP(x, y, type, size) max((x), (y))
+#define ADD_OP(x, y, type, size) ((x) + (y))
+
+__constant uchar16 type_min = 0;
+__constant uint16 idx16     = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note In case the input is not a multiple of 16, -DNON_MULTIPLE_OF_16 must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             Input image width
+ */
+__kernel void softmax_layer_max_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint width)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+
+    // Initialize local maximum
+    uchar16 max_val = 0;
+
+    // Calculate max of row
+    const uint width4 = width >> 4;
+    for(uint i = 0; i < width4; i++)
+    {
+        uchar16 data = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
+        max_val      = MAX_OP(data, max_val, uchar, 16);
+    }
+
+#ifdef NON_MULTIPLE_OF_16
+    // Handle non multiple of 16
+    uchar16 data = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
+    uchar16 widx = convert_uchar16(((uint16)(width4 << 4) + idx16) < width);
+    max_val      = MAX_OP(max_val, select(type_min, data, widx), uchar, 16);
+#endif /* NON_MULTIPLE_OF_16 */
+
+    // Perform max reduction
+    max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, uchar, 8);
+    max_val.s0123     = MAX_OP(max_val.s0123, max_val.s4567, uchar, 4);
+    max_val.s01       = MAX_OP(max_val.s01, max_val.s23, uchar, 2);
+    max_val.s0        = MAX_OP(max_val.s0, max_val.s1, uchar, 1);
+
+    // Store result
+    *((__global uchar *)dst.ptr) = max_val.s0;
+}
+
+#if defined(DIFF_MIN)
+
+int16 mult_by_quantized_multiplier(int16 data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data;
+}
+
+/** Shifts the values of the input tensor by the max value calculated in the softmax_layer_max_quantized kernel,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note In case the input is not a multiple of 16, -DNON_MULTIPLE_OF_16 must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently being processed; it determines whether the value is taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  max_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  max_step_x                        max_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  max_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  max_step_y                        max_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  max_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  max_step_z                        max_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  max_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[in]  width                             Input image width
+ */
+__kernel void softmax_layer_shift_exp_sum_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(max),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum),
+    uint width)
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    // Load max value of 1D logits vector (row)
+    int max_val = convert_int(*((__global uchar *)offset(&max, 0, 0)));
+
+    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
+    int16 sum1D = 0;
+
+    // Shift values, exp and sum
+    const uint width4 = width >> 4;
+    for(uint i = 0; i < width4; i++)
+    {
+        uchar16 data         = vload16(0, (__global uchar *)offset(&src, i << 4, 0));
+        int16 data_fp        = convert_int16(data);
+        int16 data_diff      = data_fp - max_val;
+        int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+        data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+        data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+        vstore16(data_diff, 0, (__global int *)offset(&dst, i << 4, 0));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
+    }
+
+#ifdef NON_MULTIPLE_OF_16
+    // Handle non multiple of 16
+    uchar16 data         = vload16(0, (__global uchar *)offset(&src, width4 << 4, 0));
+    int16 data_fp        = convert_int16(data);
+    int16 data_diff      = data_fp - max_val;
+    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+    data_fp              = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+    data_fp              = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
+    int16 widx           = convert_int16(((uint16)(width4 << 4) + idx16) < width);
+    vstore16(data_diff, 0, (__global int *)offset(&dst, width4 << 4, 0));
+    data_fp = select(0, data_fp, data_diff >= (int16)(DIFF_MIN));
+    sum1D   = sum1D + select(0, data_fp, widx);
+#endif /* NON_MULTIPLE_OF_16 */
+
+    // Perform sum reduction
+    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, qs16, 8);
+    sum1D.s0123     = ADD_OP(sum1D.s0123, sum1D.s4567, qs16, 4);
+    sum1D.s01       = ADD_OP(sum1D.s01, sum1D.s23, qs16, 2);
+    sum1D.s0        = ADD_OP(sum1D.s0, sum1D.s1, qs16, 1);
+
+    // Calculate and store result
+    *((__global int *)sum.ptr) = sum1D.s0;
+}
+
+/** Divides all the values of the input tensor by the sum calculated in the softmax_layer_shift_exp_sum_quantized kernel.
+ *
+ * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. -DFIXED_POINT_POSITION=4
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently being processed; it determines whether the value is taken into account or not.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst))
+{
+    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load the sum value of the 1D logits vector (row)
+    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
+
+    // It would be better to calculate this in the previous kernel and pass it here as a parameter
+    uint  sum_val_u               = convert_uint(sum_val);
+    int   headroom_plus_one       = clz(sum_val_u);
+    int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
+    int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
+    int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
+    int16 shifted_scale           = asymm_one_over_one_plus_x_for_x_in_0_1(shifted_sum_minus_one);
+
+    // This was already calculated in the previous kernel; it should be stored in a temporary output and reused
+    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
+    int16 data_diff_mult = mult_by_quantized_multiplier(data_diff);
+    int16 data           = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
+
+    data = asymm_mult(shifted_scale, data);
+    data = asymm_rounding_divide_by_pow2(data, num_bits_over_unit + 31 - 8);
+    data = select(0, data, data_diff >= (int16)(DIFF_MIN));
+    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
+}
+
+#endif /* defined(DIFF_MIN) */
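Taken together, the three kernels in this file reproduce the usual softmax in the quantized domain: a row maximum, exponentials of the (non-positive) differences accumulated in fixed point, and a final renormalisation that approximates 1/sum with the asymm helpers before requantising to 8 bits. A float-domain C++ sketch of the equivalent computation follows (illustration only, not part of this patch); the 1/256 output scale is an assumption that matches the final 8-bit shift and the saturating conversion:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// scale: quantization scale of the QASYMM8 input; beta: softmax beta; diff_min: same role as -DDIFF_MIN.
void quantized_softmax_ref(const uint8_t *src, uint8_t *dst, std::size_t width,
                           float scale, float beta, int diff_min)
{
    const int max_val = *std::max_element(src, src + width);

    std::vector<float> exps(width, 0.0f);
    float sum = 0.0f;
    for(std::size_t i = 0; i < width; ++i)
    {
        const int diff = static_cast<int>(src[i]) - max_val; // always <= 0
        if(diff >= diff_min)                                 // values far below the max contribute ~0 and are skipped
        {
            exps[i] = std::exp(beta * scale * static_cast<float>(diff));
            sum += exps[i];
        }
    }
    for(std::size_t i = 0; i < width; ++i)
    {
        const long q = std::lround(256.0f * exps[i] / sum);      // probabilities mapped to [0, 256]
        dst[i]       = static_cast<uint8_t>(std::min(q, 255L));  // saturate, mirroring convert_uchar16_sat
    }
}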
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 26a8b85..86a5e06 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -23,6 +23,21 @@
  */
 #include "helpers.h"
 
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
 /** Clamps the given coordinates to the borders.
  *
  * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
@@ -32,9 +47,7 @@
  */
 inline const float8 clamp_to_border(float8 coords, const float width, const float height)
 {
-    const float4 clamped_x = clamp(coords.even, -1.0f, width);
-    const float4 clamped_y = clamp(coords.odd, -1.0f, height);
-    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+    return clamp_to_border_with_size(coords, width, height, 1);
 }
 
 /** Reads four texels from the input image. The coords vector is used to determine which texels to be read.
@@ -64,7 +77,7 @@
  * @param[in] coord Input coordinates
  *
  * @return vector of 8 floats with the coordinates, even positions are x and odd y.
-*/
+ */
 inline const float8 get_neighbour_coords(const float2 coord)
 {
     return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
@@ -72,23 +85,25 @@
 
 /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
  *
- * @param[in] in     Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width  Width of the image
- * @param[in] height Height of the image
-*/
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+ * @param[in] in          Pointer to the source image.
+ * @param[in] coords      Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
 {
     // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
 
     // Sets the 4x4 coordinates for each of the four input texels
     const float8  fc = floor(coords);
     const float16 c1 = (float16)(
-                           clamp_to_border(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height),
-                           clamp_to_border(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height));
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
     const float16 c2 = (float16)(
-                           clamp_to_border(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height),
-                           clamp_to_border(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height));
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+
     // Loads the values from the input image
     const float16 t = (float16)(
                           /* tl, tr, bl, br */
@@ -109,3 +124,15 @@
                           ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
     return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
 }
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in     Pointer to the source image.
+ * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+{
+    return bilinear_interpolate_with_border(in, coords, width, height, 1);
+}
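The new *_with_size variants generalise clamping and bilinear interpolation to an arbitrary border size, keeping the original functions as border_size = 1 wrappers. A scalar C++ sketch of one bilinearly interpolated sample is shown below (illustration only, not part of this patch); the padded-buffer layout, with `border` extra pixels on every side addressed through `stride`, is an assumption made for the example:

#include <algorithm>
#include <cmath>

// padded points at the first valid pixel; the buffer carries `border` extra pixels on each side.
float bilinear_sample_ref(const float *padded, int stride, int width, int height, int border,
                          float x, float y)
{
    // Clamp a coordinate into [-border, limit - 1 + border], mirroring clamp_to_border_with_size().
    auto clamp_coord = [border](float c, int limit) {
        return std::min(std::max(c, static_cast<float>(-border)), static_cast<float>(limit - 1 + border));
    };
    auto at = [&](float cx, float cy) {
        const int ix = static_cast<int>(clamp_coord(cx, width));
        const int iy = static_cast<int>(clamp_coord(cy, height));
        return padded[(iy + border) * stride + (ix + border)];
    };
    const float x0 = std::floor(x);
    const float y0 = std::floor(y);
    const float a  = x - x0; // horizontal weight
    const float b  = y - y0; // vertical weight
    return (1 - a) * (1 - b) * at(x0, y0)     + a * (1 - b) * at(x0 + 1, y0)
         + (1 - a) * b       * at(x0, y0 + 1) + a * b       * at(x0 + 1, y0 + 1);
}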
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
index d955e42..d249aa6 100644
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -48,7 +48,7 @@
     const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
     // transform [z,z+1,z+2,z+3]
     const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
-    // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
+    // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with reference implementation
     // transform [x,x+1,x+2,x+3]
     const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
     // transform [y,y+1,y+2,y+3]
@@ -62,7 +62,7 @@
  *   x0 = M[1][1] * x + M[1][2] * y + M[1][3]
  *   y0 = M[2][1] * x + M[2][2] * y + M[2][3]
  *   z0 = M[3][1] * x + M[3][2] * y + M[3][3]
-
+ *
  *   output(x,y) = input(x0/z0,y0/z0)
  *
  * @attention The matrix coefficients need to be passed at compile time:\n
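The mapping documented above, written for a single output pixel as scalar C++ (illustration only, not part of this patch); the coefficient order m[0]..m[8] follows the kernel's mtx.s0..mtx.s8 layout:

// Maps an output coordinate (x, y) to the input coordinate (x0, y0) it samples from.
void warp_perspective_point(const float m[9], float x, float y, float *x0, float *y0)
{
    const float xn = m[0] * x + m[3] * y + m[6];
    const float yn = m[1] * x + m[4] * y + m[7];
    const float z  = m[2] * x + m[5] * y + m[8];
    // Divide x and y by z directly (see the NOTE in the kernel about not multiplying by 1/z).
    *x0 = xn / z;
    *y0 = yn / z;
}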
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 18202c1..eecc94f 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -34,12 +34,65 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->data_type() == DataType::QASYMM8) && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
+                                    "For QASYMM8 only lower/upper bounded relu is supported");
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    if(output != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input);
+    }
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win,
+                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLActivationLayerKernel::CLActivationLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
@@ -47,66 +100,105 @@
 
 void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     if(output != nullptr)
     {
         // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        auto_init_if_empty(*output->info(),
+                           *input->info()->clone());
     }
 
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+
     const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const DataType     dt                                = input->info()->data_type();
     const int          fixed_point_position              = input->info()->fixed_point_position();
     float              a_const                           = act_info.a();
     float              b_const                           = act_info.b();
-    if(is_data_type_fixed_point(input->info()->data_type()))
+    int                a_const_int                       = 0;
+    int                b_const_int                       = 0;
+
+    // Create quantized version of constants a, b if needed
+    if(is_data_type_quantized(dt))
     {
-        a_const = static_cast<int>(lround(a_const * (1 << fixed_point_position)));
-        b_const = static_cast<int>(lround(b_const * (1 << fixed_point_position)));
+        if(is_data_type_fixed_point(dt))
+        {
+            a_const_int = static_cast<int>(lround(a_const * (1 << fixed_point_position)));
+            b_const_int = static_cast<int>(lround(b_const * (1 << fixed_point_position)));
+        }
+        else
+        {
+            a_const_int = input->info()->quantization_info().quantize(a_const, RoundingPolicy::TO_NEAREST_UP);
+            b_const_int = input->info()->quantization_info().quantize(b_const, RoundingPolicy::TO_NEAREST_UP);
+        }
     }
 
     // Set build options
     std::set<std::string> build_opts;
     build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
     build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const)));
-    build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const)));
+
+    if(is_data_type_quantized(dt))
+    {
+        build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+        build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+
+        // Set scale and offset of the input and output
+        if(is_data_type_quantized_asymmetric(dt))
+        {
+            float s1 = input->info()->quantization_info().scale;
+            int   o1 = input->info()->quantization_info().offset;
+            // If output is nullptr, assume same quantization scale/offset as input
+            float s2 = output != nullptr ? output->info()->quantization_info().scale : s1;
+            int   o2 = output != nullptr ? output->info()->quantization_info().offset : o1;
+            build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+            build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+            build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+            build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+        }
+    }
+    else
+    {
+        build_opts.emplace(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+        build_opts.emplace(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+    }
+
     build_opts.emplace(output == nullptr ? "-DIN_PLACE" : "");
-    if(is_data_type_fixed_point(input->info()->data_type()))
+    if(is_data_type_fixed_point(dt))
     {
         build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fixed_point_position)));
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+    std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
     // Make sure _kernel is initialized before calling the parent's configure
-
     _input  = input;
     _output = output;
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        update_window_and_padding(win,
-                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    }
+    // Set config_id for enabling LWS tuning
+    _config_id = "activation_layer_";
+    _config_id += lower_string(string_from_data_type(dt));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
 
-    ICLKernel::configure(win);
+Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+
+    return Status{};
 }
 
 void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -125,7 +217,7 @@
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
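For QASYMM8 inputs, configure() above turns the activation bounds a and b into quantized integers before injecting them as -DA_VAL / -DB_VAL, and also exports the input/output scale and offset (S1/S2, O1/O2) so the kernel can requantise its result. A sketch of the usual asymmetric quantization of such a constant (illustration only, not part of this patch; the clamping to [0, 255] is an assumption about QuantizationInfo::quantize()):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a real-valued activation bound with the input's (scale, offset).
uint8_t quantize_bound_ref(float value, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(value / scale)) + offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

// e.g. a_const_int = quantize_bound_ref(act_info.a(), s1, o1);  // becomes -DA_VAL
//      b_const_int = quantize_bound_ref(act_info.b(), s1, o1);  // becomes -DB_VAL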
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 65422c2..2789573 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -41,6 +41,51 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+    // Validate in case of configured output
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+                                        "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
@@ -68,17 +113,7 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
 
     _input1 = input1;
     _input2 = input2;
@@ -101,22 +136,17 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
 
     // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
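The new static validate() mirrors the checks performed in configure(), so a caller can verify a configuration before allocating or configuring anything. A hypothetical usage sketch (the tensor names are assumptions made for the example):

// input1, input2 and output are CLTensor objects whose ITensorInfo has been set up.
const Status status = CLArithmeticAdditionKernel::validate(input1.info(), input2.info(),
                                                           output.info(), ConvertPolicy::SATURATE);
ARM_COMPUTE_ERROR_THROW_ON(status); // raises if configure() would reject these tensor infos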
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index c5183af..cc2ef1f 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -38,6 +38,50 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+
+    // Validate in case of configured output
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+                                        "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
@@ -61,17 +105,7 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
 
     _input1 = input1;
     _input2 = input2;
@@ -94,21 +128,17 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
 
     // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
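
For illustration only, not part of the patch: the validate_arguments()/validate_and_configure_window() pair introduced above backs the new static validate() entry point, which checks a configuration without creating any OpenCL objects. A minimal sketch of calling it, with placeholder shapes and data types:

#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
#include "arm_compute/core/TensorInfo.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    // Placeholder tensor metadata; no backing memory or CL context is needed for validation.
    const TensorInfo input1(TensorShape(16U, 4U), 1, DataType::F32);
    const TensorInfo input2(TensorShape(16U, 4U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U, 4U), 1, DataType::F32);

    const Status status = CLArithmeticSubtractionKernel::validate(&input1, &input2, &output, ConvertPolicy::SATURATE);
    if(status.error_code() != ErrorCode::OK)
    {
        std::cerr << status.error_description() << std::endl;
        return 1;
    }
    return 0;
}

Internally validate() performs the same checks as configure(), but on cloned ITensorInfo objects, so the two paths cannot disagree about padding requirements.
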
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 43f39f4..663b044 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -37,6 +37,61 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const ITensorInfo *mean, const ITensorInfo *var,
+                          const ITensorInfo *beta, const ITensorInfo *gamma,
+                          float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+
+    if(output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    if(output != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input->clone());
+    }
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = false;
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0)
 {
@@ -45,7 +100,7 @@
 void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
                                                 float epsilon)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
 
     _input   = input;
     _output  = output;
@@ -57,21 +112,13 @@
 
     if(output != nullptr)
     {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input->info(), output->info());
         // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
+        auto_init_if_empty(*output->info(), *input->info()->clone());
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
+                                                  mean->info(), var->info(), beta->info(), gamma->info(), epsilon));
 
     const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
@@ -89,23 +136,25 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
 
     // Set kernel static arguments
-    unsigned int idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+    unsigned int include_output = (output != nullptr) ? 1 : 0;
+    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        update_window_and_padding(win, input_access);
-    }
-    ICLKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                                 const ITensorInfo *mean, const ITensorInfo *var,
+                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
+                                                 float epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+
+    return Status{};
 }
 
 void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -118,7 +167,8 @@
     Window vector_slice = window.first_slice_window_1D();
     vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    unsigned int include_output = (_output != nullptr) ? 1 : 0;
+    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
     add_1D_tensor_argument(idx, _beta, vector_slice);
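
For illustration only, not part of the patch: the output is now optional end to end (configure(), the window setup and the argument indexing above), so the kernel can be checked without a separate output tensor by passing nullptr, in which case the result is presumably written back over the input. A minimal sketch with placeholder shapes and an arbitrary epsilon:

#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

void check_batchnorm_without_output()
{
    // 8x8 feature maps with 2 channels; mean/var/beta/gamma are 1D vectors of length 2.
    const TensorInfo src(TensorShape(8U, 8U, 2U), 1, DataType::F32);
    const TensorInfo params(TensorShape(2U), 1, DataType::F32);

    // Null output: only the input access window is registered (see validate_and_configure_window above).
    const Status status = CLBatchNormalizationLayerKernel::validate(&src, nullptr, &params, &params, &params, &params, 0.001f);
    ARM_COMPUTE_ERROR_THROW_ON(status);
}
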
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index c7884e3..499e1e8 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -43,7 +43,7 @@
 
 void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     TensorShape output_shape = input->info()->tensor_shape();
@@ -52,7 +52,7 @@
     output_shape.set(2, input->info()->tensor_shape()[0]);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -62,15 +62,30 @@
     _output         = output;
     _convolved_dims = convolved_dims;
 
-    // Create kernel
-    std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
-    build_opts.emplace("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
+    const DataType data_type = input->info()->data_type();
 
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts));
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning over 30 representative tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        if((_convolved_dims.first == 7) || (_convolved_dims.first == 14))
+        {
+            _lws_hint = cl::NDRange(1, 7, 1);
+        }
+        else
+        {
+            _lws_hint = cl::NDRange(1, 8, 1);
+        }
+    }
 
     // Configure window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -81,6 +96,18 @@
     output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "col2im_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -103,7 +130,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(collapsed_window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
similarity index 93%
rename from src/core/CL/kernels/CLDepthConcatenateKernel.cpp
rename to src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index edfbf82..0275d4f 100644
--- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -41,17 +41,17 @@
 
 using namespace arm_compute;
 
-CLDepthConcatenateKernel::CLDepthConcatenateKernel()
+CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
     : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
 {
 }
 
-BorderSize CLDepthConcatenateKernel::border_size() const
+BorderSize CLDepthConcatenateLayerKernel::border_size() const
 {
     return BorderSize(_top_bottom, _left_right);
 }
 
-void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
 {
     static std::map<int, std::pair<std::string, int>> configs_map =
     {
@@ -108,7 +108,7 @@
     ICLKernel::configure(win);
 }
 
-void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
similarity index 93%
rename from src/core/CL/kernels/CLDepthConvertKernel.cpp
rename to src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index c43884a..83908a1 100644
--- a/src/core/CL/kernels/CLDepthConvertKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -38,7 +38,7 @@
 
 using namespace arm_compute;
 
-void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S16, DataType::QS16,
                                                   DataType::U16, DataType::U32, DataType::S32, DataType::F32);
@@ -90,7 +90,8 @@
     if(input_size > output_size)
     {
         kernel_name += "_down";
-        build_opts.insert((policy == ConvertPolicy::WRAP) ? "-DWRAP" : "-DSATURATE");
+        // Down-conversions from float always SATURATE, as out-of-range float->integer conversion is implementation-defined
+        build_opts.insert(((policy == ConvertPolicy::WRAP) && !is_data_type_float(input->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
     }
     else
     {
diff --git a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
deleted file mode 100644
index 6e56835..0000000
--- a/src/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-
-using namespace arm_compute;
-
-CLDepthwiseConvolution3x3Kernel::CLDepthwiseConvolution3x3Kernel()
-    : _border_size(0), _input(), _output(), _weights(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0)
-{
-}
-
-BorderSize CLDepthwiseConvolution3x3Kernel::border_size() const
-{
-    return _border_size;
-}
-
-void CLDepthwiseConvolution3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
-    std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
-                                                                              weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
-                                                                              conv_info);
-
-    ARM_COMPUTE_UNUSED(expected_output);
-    ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
-    ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
-
-    _input         = input;
-    _output        = output;
-    _weights       = weights;
-    _conv_stride_x = conv_info.stride().first;
-    _conv_stride_y = conv_info.stride().second;
-    _conv_pad_x    = conv_info.pad().first;
-    _conv_pad_y    = conv_info.pad().second;
-    _border_size   = BorderSize(_conv_pad_y, _conv_pad_x);
-
-    // Set build options
-    ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
-    std::set<std::string> options{ "-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x) };
-
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_convolution_3x3", options));
-
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 2;
-    const unsigned int num_elems_written_per_iteration   = 2;
-    const unsigned int num_elems_read_per_iteration      = 3 + _conv_stride_x;
-    const unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
-
-    update_window_and_padding(win, input_access, weights_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
-}
-
-void CLDepthwiseConvolution3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    Window slice_in      = window.first_slice_window_3D();
-    Window slice_out     = window.first_slice_window_3D();
-    Window slice_weights = window.first_slice_window_3D();
-
-    slice_in.adjust(Window::DimX, -_conv_pad_x, true);
-    slice_in.adjust(Window::DimY, -_conv_pad_y, true);
-    slice_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
-    slice_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-    slice_weights.set_dimension_step(Window::DimX, 0);
-    slice_weights.set_dimension_step(Window::DimY, 0);
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
-        add_3D_tensor_argument(idx, _weights, slice_weights);
-
-        enqueue(queue, *this, slice_out);
-    }
-    while(window.slide_window_slice_3D(slice_out));
-}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
new file mode 100644
index 0000000..ddc3a2d
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Calculates the expected output shape
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] weights_shape Weights tensor shape
+ * @param[in] conv_info     Convolution padding and stride information
+ *
+ * @return Expected output shape
+ */
+TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+
+    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+    TensorShape output_shape = input_shape;
+    output_shape.set(0, output_width);
+    output_shape.set(1, output_height);
+
+    return output_shape;
+}
+} // namespace
+
+CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel()
+    : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
+{
+}
+
+BorderSize CLDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+    if(biases != nullptr)
+    {
+        if(is_data_type_quantized_asymmetric(weights->info()->data_type()))
+        {
+            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    // Get convolved dimensions
+    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _input         = input;
+    _output        = output;
+    _weights       = weights;
+    _biases        = biases;
+    _conv_stride_x = conv_info.stride().first;
+    _conv_stride_y = conv_info.stride().second;
+    _conv_pad_left = conv_info.pad_left();
+    _conv_pad_top  = conv_info.pad_top();
+    _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+
+    // Set build options
+    ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
+    build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
+
+    // Create kernel
+    std::string kernel_name = is_data_type_quantized_asymmetric(_input->info()->data_type()) ? "depthwise_convolution_3x3_quantized" : "depthwise_convolution_3x3";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments
+    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
+    {
+        float multiplier        = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+
+        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0);
+
+        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
+        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
+        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
+        _kernel.setArg(idx++, output_multiplier);
+        _kernel.setArg(idx++, output_shift);
+    }
+
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        // Assume uniform padding and striding.
+        const size_t pad    = _conv_pad_left;
+        const size_t stride = _conv_stride_x;
+        const size_t width  = input->info()->dimension(0);
+        if(pad == 1)
+        {
+            const size_t width_by_stride = width / stride;
+            if(width_by_stride == 28) // 56/2 or 28/1
+            {
+                _lws_hint = cl::NDRange(7, 4, 3);
+            }
+            else if(width_by_stride == 14) // 28/2 or 14/1
+            {
+                _lws_hint = cl::NDRange(7, 7, 4);
+            }
+        }
+        else if(pad == 0)
+        {
+            if(width >= 56) // 56 or 112
+            {
+                _lws_hint = cl::NDRange(8, 5, 2);
+            }
+            else if(width >= 14) // 14 or 28
+            {
+                _lws_hint = cl::NDRange(1, 5, 2);
+            }
+            else // 7
+            {
+                _lws_hint = cl::NDRange(1, 1, 2);
+            }
+        }
+    }
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 2;
+    const unsigned int num_elems_written_per_iteration   = 2;
+    const unsigned int num_elems_read_per_iteration      = 3 + _conv_stride_x;
+    const unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration, _conv_stride_x, _conv_stride_y);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+    AccessWindowStatic     weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, weights_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Create input window and adjust
+    Window win_in = window;
+    win_in.adjust(Window::DimX, -_conv_pad_left, true);
+    win_in.adjust(Window::DimY, -_conv_pad_top, true);
+    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+    Window slice_in      = win_in.first_slice_window_3D();
+    Window slice_out     = window.first_slice_window_3D();
+    Window slice_weights = window.first_slice_window_3D();
+    slice_weights.set_dimension_step(Window::DimX, 0);
+    slice_weights.set_dimension_step(Window::DimY, 0);
+
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = 3 * num_arguments_per_3D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        add_3D_tensor_argument(idx, _weights, slice_weights);
+
+        enqueue(queue, *this, slice_out, _lws_hint);
+    }
+    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
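
For illustration only, not part of the patch: the quantized branch above folds the input, weights and output scales into a single requantization factor and then splits it into an integer multiplier plus a right shift for the kernel. A small sketch of that decomposition with made-up scales, using the same helper the kernel calls:

#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

#include <cstdio>

void show_requantization_split()
{
    // Example quantization scales (placeholders only).
    const float input_scale   = 0.5f;
    const float weights_scale = 0.25f;
    const float output_scale  = 1.0f;

    // Same formula as in configure(); the resulting factor is expected to be < 1.
    const float multiplier = input_scale * weights_scale / output_scale; // 0.125

    int output_multiplier = 0;
    int output_shift      = 0;
    // Decomposes multiplier as output_multiplier * 2^(-output_shift), where
    // output_multiplier is a 32-bit fixed-point value encoding a number in [0.5, 1).
    arm_compute::quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);

    // For 0.125 this gives output_shift = 2 and output_multiplier = 1073741824 (0.5 in Q0.31).
    std::printf("output_multiplier=%d output_shift=%d\n", output_multiplier, output_shift);
}
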
diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
index 0eaadb8..ad9ac0e 100644
--- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include <tuple>
 
 using namespace arm_compute;
@@ -41,13 +42,13 @@
 {
 }
 
-void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info)
+void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
 
     _input  = input;
     _output = output;
@@ -58,15 +59,28 @@
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
     build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-    build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
-    build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
+    build_opts.emplace("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.emplace("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+    build_opts.emplace("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+    build_opts.emplace("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
     build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
     build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
-
+    if(has_bias)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_im2col", build_opts));
 
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        _lws_hint = cl::NDRange(1, 2, 1);
+    }
+
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());
     // The CLDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -99,7 +113,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice_in);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
 }
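
For illustration only, not part of the patch: the four -DPAD_* build options above come straight from the PadStrideInfo accessors instead of the old symmetric pad() pair. A short sketch with made-up geometry; the explicit left/right/top/bottom constructor overload is an assumption, while the accessors are exactly the ones the kernel reads:

#include "arm_compute/core/Types.h"

#include <cstdio>

int main()
{
    using namespace arm_compute;

    // Assumed overload: stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, rounding.
    const PadStrideInfo conv_info(1, 1, 2, 1, 2, 1, DimensionRoundingType::FLOOR);

    // Forwarded to the OpenCL build as -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT and -DPAD_BOTTOM.
    std::printf("left=%u top=%u right=%u bottom=%u\n",
                conv_info.pad_left(), conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom());
    return 0;
}
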
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index 2086b1d..dc47bb0 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -42,6 +42,17 @@
 void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTensor *output, size_t conv_w, size_t conv_h)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, conv_w);
+    output_shape.set(1, conv_h);
+    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
diff --git a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
index 68de68b..81dd6b4 100644
--- a/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.cpp
@@ -35,19 +35,28 @@
 using namespace arm_compute;
 
 CLDepthwiseWeightsReshapeKernel::CLDepthwiseWeightsReshapeKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _biases(nullptr), _output(nullptr)
 {
 }
 
-void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLDepthwiseWeightsReshapeKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * input->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
 
     _input  = input;
+    _biases = biases;
     _output = output;
 
     // Create kernel
@@ -55,6 +64,10 @@
 
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    if(_biases != nullptr)
+    {
+        build_opts.emplace("-DHAS_BIAS");
+    }
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("depthwise_weights_reshape", build_opts));
 
@@ -84,6 +97,15 @@
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
+    // Set biases
+    if(_biases != nullptr)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+        add_1D_tensor_argument(idx, _biases, slice_biases);
+    }
+
     do
     {
         unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 4224d9b..4f75311 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -34,94 +34,110 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
-    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_pad_x(0), _conv_pad_y(0), _conv_stride_x(0), _conv_stride_y(0)
+namespace
 {
+/** Calculates the expected output shape
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] weights_shape Weights tensor shape
+ * @param[in] conv_info     Convolution padding and stride information
+ *
+ * @return Expected output shape
+ */
+TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+    std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+    TensorShape output_shape = input_shape;
+    output_shape.set(0, output_width);
+    output_shape.set(1, output_height);
+    output_shape.set(2, weights_shape[3]);
+
+    return output_shape;
 }
 
-BorderSize CLDirectConvolutionLayerKernel::border_size() const
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
-    return _border_size;
-}
-
-void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != weights->info()->dimension(1),
-                             "Weights should have same width as length");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5,
-                             "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-    ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
-    ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(0) == 3 || weights->info()->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3 convolution.");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
+                                    "Weights should have same width and height");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 1 && weights->dimension(0) != 3 && weights->dimension(0) != 5,
+                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(2) != input->dimension(2),
+                                    "Weights feature map dimension should match that of the input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != weights->dimension(1),
+                                    "Only square weights are supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4,
+                                    "Weights can be at most 4 dimensional");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 1) && std::get<0>(conv_info.stride()) > 3,
+                                    "Strides larger than 3 not supported for 1x1 convolution.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(0) == 3 || weights->dimension(0) == 5) && std::get<0>(conv_info.stride()) > 2,
+                                    "Strides larger than 2 not supported for 3x3 convolution.");
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of output feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
+                                        "Biases should be one dimensional");
     }
 
-    const unsigned int kernel_size = weights->info()->dimension(0);
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                           get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
+{
+    const unsigned int kernel_size = weights->dimension(0);
+    const DataType     data_type   = input->data_type();
 
     // Get convolved dimensions
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights->info()->dimension(3));
+    TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output, output_shape,
+                       1,
+                       input->data_type(),
+                       input->fixed_point_position(),
+                       input->quantization_info());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    unsigned int conv_stride_x   = std::get<0>(conv_info.stride());
+    unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+    unsigned int conv_pad_left   = std::max(conv_info.pad_left(), kernel_size / 2);
+    unsigned int conv_pad_top    = std::max(conv_info.pad_top(), kernel_size / 2);
+    unsigned int conv_pad_right  = std::max(conv_info.pad_right(), kernel_size / 2);
+    unsigned int conv_pad_bottom = std::max(conv_info.pad_bottom(), kernel_size / 2);
 
-    _conv_stride_x = std::get<0>(conv_info.stride());
-    _conv_stride_y = std::get<1>(conv_info.stride());
-    _conv_pad_x    = std::min(std::get<0>(conv_info.pad()), kernel_size / 2);
-    _conv_pad_y    = std::min(std::get<1>(conv_info.pad()), kernel_size / 2);
+    unsigned int num_elems_read_per_iteration_x    = 0;
+    unsigned int num_elems_read_per_iteration_y    = 0;
+    unsigned int num_elems_written_per_iteration_x = 0;
+    unsigned int num_elems_written_per_iteration_y = 0;
 
-    _input       = input;
-    _weights     = weights;
-    _output      = output;
-    _biases      = biases;
-    _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
-
-    std::set<std::string> options;
-
-    const GPUTarget gpu_target = get_arch_from_target(get_target());
-
-    if(_biases != nullptr)
+    if((target == GPUTarget::BIFROST) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32))
     {
-        options.emplace("-DHAS_BIAS");
-    }
-
-    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (input->info()->data_type() == DataType::F32))
-    {
-        options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
-
-        std::string kernel_name = "direct_convolution" + support::cpp11::to_string(kernel_size) + "x" + support::cpp11::to_string(kernel_size) + "_f32_bifrost";
-        _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
-
         // Configure kernel window
-        Window win = calculate_max_window(*output->info());
-
-        unsigned int num_elems_read_per_iteration_x    = 0;
-        unsigned int num_elems_read_per_iteration_y    = 0;
-        unsigned int num_elems_written_per_iteration_x = 0;
-        unsigned int num_elems_written_per_iteration_y = 0;
 
         switch(kernel_size)
         {
@@ -154,92 +170,291 @@
                 ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
             }
         }
-
-        // Calculate right and bottom border
-        const int input_width  = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
-        const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
-
-        // Create window and update padding
-        win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
-        AccessWindowStatic    input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
-        AccessWindowStatic    weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
-        update_window_and_padding(win, input_access, weights_access, output_access);
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-        ICLKernel::configure(win);
     }
     else
     {
-        std::stringstream kernel_name;
-        kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-        DataType promoted_type = input->info()->data_type();
-
-        options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-        options.emplace("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
-        options.emplace("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)));
-        options.emplace("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
-
-        if(is_data_type_fixed_point(input->info()->data_type()))
+        num_elems_read_per_iteration_y    = kernel_size;
+        num_elems_written_per_iteration_x = 8;
+        num_elems_written_per_iteration_y = 1;
+        switch(kernel_size)
         {
-            options.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+            case 1:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 8;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 16;
+                        break;
+                    case 3:
+                        switch(input->element_size())
+                        {
+                            case 1:
+                                num_elems_read_per_iteration_x = 28;
+                                break;
+                            case 2:
+                                num_elems_read_per_iteration_x = 24;
+                                break;
+                            case 4:
+                                num_elems_read_per_iteration_x = 22;
+                                break;
+                            default:
+                                ARM_COMPUTE_ERROR("Invalid data size");
+                        }
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 3:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 10;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 17;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 5:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 12;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 20;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid direct convolution size");
+        }
+    }
 
-            switch(input->info()->data_type())
+    // Calculate right and bottom border
+    int input_width  = input->dimension(0) + conv_pad_left + conv_pad_right;
+    int input_height = input->dimension(1) + conv_pad_top + conv_pad_bottom;
+
+    // Round the accessed region up to the read block size; without this, the window would always be reported as changed
+    input_width  = ceil_to_multiple(input_width, num_elems_read_per_iteration_x);
+    input_height = ceil_to_multiple(input_height, num_elems_read_per_iteration_y);
+
+    // Create window and update padding
+    bool   window_changed = false;
+    Window win            = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
+
+    AccessWindowStatic    input_access(input, -conv_pad_left, -conv_pad_top, input_width, input_height);
+    AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
+
+    window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
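
Note: the rounding applied above to the accessed input region can be reproduced with a small stand-alone helper. The sketch below is illustrative only (the library ships its own ceil_to_multiple helper); the shape in the example is hypothetical.

#include <cassert>

// Sketch: round value up to the next multiple of divisor, as done for the
// accessed input width/height in validate_and_configure_window().
static int ceil_to_multiple_sketch(int value, int divisor)
{
    assert(divisor > 0);
    return ((value + divisor - 1) / divisor) * divisor;
}

// Example (hypothetical): a 3x3 kernel with stride_x = 1 reads 10 elements per
// iteration, so an input row of 64 pixels plus 1 pixel of padding on each side
// (66 in total) is rounded up to ceil_to_multiple_sketch(66, 10) == 70.
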
+
+CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
+    : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+{
+}
+
+BorderSize CLDirectConvolutionLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    const unsigned int kernel_size = weights->info()->dimension(0);
+    const DataType     data_type   = input->info()->data_type();
+
+    // Get convolved dimensions
+    TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  weights->info(),
+                                                  (biases != nullptr) ? biases->info() : nullptr,
+                                                  output->info(),
+                                                  conv_info));
+
+    _conv_stride_x = std::get<0>(conv_info.stride());
+    _conv_stride_y = std::get<1>(conv_info.stride());
+
+    _input   = input;
+    _weights = weights;
+    _output  = output;
+    _biases  = biases;
+
+    int conv_pad_left   = std::min(conv_info.pad_left(), kernel_size / 2);
+    int conv_pad_top    = std::min(conv_info.pad_top(), kernel_size / 2);
+    int conv_pad_right  = std::min(conv_info.pad_right(), kernel_size / 2);
+    int conv_pad_bottom = std::min(conv_info.pad_bottom(), kernel_size / 2);
+    _border_size        = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
+
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+
+    std::stringstream kernel_name;
+    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+
+    CLBuildOptions build_options;
+    build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
+
+    if((gpu_target == GPUTarget::BIFROST) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32))
+    {
+        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
+
+        kernel_name << "_f32_bifrost";
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), build_options.options()));
+
+        // Through extensive experimentation with over 30 representative tensor
+        // shapes, we found a small number of local work size configurations
+        // that result in nearly optimal execution times. Selecting the right
+        // lws for a given shape, however, required a complex decision tree,
+        // until we constructed a simple feature as described below.
+        //
+        // We started from the number of multiply-accumulate operations for a
+        // convolution layer, which is equal to the product of the input
+        // dimensions 0..2 and the weights dimensions 0..2.  Unfortunately,
+        // this resulted in ties between distinct shapes that required distinct
+        // lws configurations. Replacing the width of the input with the kernel
+        // size, however, resulted in nearly optimal predictions. We use underscores
+        // in variable names to indicate when they are intentionally misleading.
+        const size_t product_of_weights_dimensions = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2);
+        const size_t product_of_input_dimensions_  = input->info()->dimension(0) * weights->info()->dimension(1) * input->info()->dimension(2);
+        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;
+
+        switch(kernel_size)
+        {
+            case 1:
             {
-                case DataType::QS8:
-                    promoted_type = DataType::QS16;
-                    break;
-                case DataType::QS16:
-                    promoted_type = DataType::QS32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Datatype not supported");
+                if(mega_ops_ < 1.f)
+                {
+                    _lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 7.f)
+                {
+                    _lws_hint = cl::NDRange(1, 1, 4);
+                }
+                else
+                {
+                    _lws_hint = cl::NDRange(1, 1, 2);
+                }
+                break;
+            }
+            case 3:
+            {
+                if(mega_ops_ < 1.f)
+                {
+                    _lws_hint = cl::NDRange(1, 1, 8);
+                }
+                else if(mega_ops_ < 13.f)
+                {
+                    _lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else if(mega_ops_ < 50.f)
+                {
+                    _lws_hint = cl::NDRange(3, 1, 4);
+                }
+                else
+                {
+                    _lws_hint = cl::NDRange(2, 1, 6);
+                }
+                break;
+            }
+            case 5:
+            {
+                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
+                {
+                    _lws_hint = cl::NDRange(2, 1, 4);
+                }
+                else
+                {
+                    _lws_hint = cl::NDRange(2, 1, 8);
+                }
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
             }
         }
+    }
+    else
+    {
+        bool     is_quantized_fixed_point = is_data_type_fixed_point(data_type);
+        bool     is_quantized_asymm       = is_data_type_quantized_asymmetric(data_type);
+        DataType promoted_type            = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
 
-        options.emplace("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type));
+        build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
+        build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+        build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2))));
+        build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
+        build_options.add_option_if(is_quantized_fixed_point,
+                                    std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+        build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(promoted_type)));
 
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str(), options));
+        // Create kernel
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(is_quantized_asymm ? "direct_convolution_1x1_3x3_5x5_quantized" : kernel_name.str(),
+                                                                               build_options.options()));
+    }
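
To make the lws heuristic above concrete, here is a worked example for a hypothetical layer; the shapes are made up to illustrate the arithmetic and are not taken from this patch.

// Hypothetical 3x3 convolution: weights 3x3x64, input 56x56x64 (width x height x channels).
// The names mirror the variables computed in the Bifrost branch above.
const size_t product_of_weights_dimensions = 3 * 3 * 64;  // w0 * w1 * w2 = 576
const size_t product_of_input_dimensions_  = 56 * 3 * 64; // input width * kernel height * input channels = 10752
const float  mega_ops_                     = 1e-6f * product_of_weights_dimensions * product_of_input_dimensions_; // ~6.19

// Since 1.f <= mega_ops_ < 13.f, the kernel_size == 3 case above would select cl::NDRange(2, 1, 4).
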
 
-        // Configure kernel window
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
-        bool is_stride2 = ((kernel_size != 1) && (_conv_stride_x == 2));
+    // Set static kernel arguments
+    if(is_data_type_quantized_asymmetric(data_type))
+    {
+        int output_multiplier = 0;
+        int output_shift      = 0;
 
-        const unsigned int num_elems_read_per_iteration_x    = 8 + 2 * (kernel_size / 2) + (is_stride2 ? 6 + kernel_size / 2 : 0);
-        const unsigned int num_elems_read_per_iteration_y    = kernel_size;
-        const unsigned int num_elems_written_per_iteration_x = 8;
-        const unsigned int num_elems_written_per_iteration_y = 1;
+        float multiplier = _input->info()->quantization_info().scale * _weights->info()->quantization_info().scale / _output->info()->quantization_info().scale;
+        ARM_COMPUTE_THROW_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
 
-        // Calculate right and bottom border
-        const int input_width  = input->info()->dimension(0) - kernel_size / 2 + _conv_pad_x;
-        const int input_height = input->info()->dimension(1) - kernel_size / 2 + _conv_pad_y;
-
-        // Create window and update padding
-        Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
-
-        AccessWindowStatic    input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + num_elems_read_per_iteration_x, input_height + num_elems_read_per_iteration_y);
-        AccessWindowStatic    weights_access(weights->info(), 0, 0, kernel_size, kernel_size);
-        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
-
-        update_window_and_padding(win, input_access, weights_access, output_access);
-
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-        ICLKernel::configure(win);
+        unsigned int idx = 3 * num_arguments_per_3D_tensor() + ((_biases != nullptr) ? num_arguments_per_1D_tensor() : 0) + 1;
+        _kernel.setArg(idx++, -_input->info()->quantization_info().offset);
+        _kernel.setArg(idx++, -_weights->info()->quantization_info().offset);
+        _kernel.setArg(idx++, _output->info()->quantization_info().offset);
+        _kernel.setArg(idx++, output_multiplier);
+        _kernel.setArg(idx++, output_shift);
     }
 
     // Set config_id for enabling LWS tuning
     _config_id = "direct_convolution_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
     _config_id += support::cpp11::to_string(kernel_size);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(_conv_pad_x);
+    _config_id += support::cpp11::to_string(conv_pad_left);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(_conv_pad_y);
+    _config_id += support::cpp11::to_string(conv_pad_top);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_pad_right);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_pad_bottom);
     _config_id += "_";
     _config_id += support::cpp11::to_string(_conv_stride_x);
     _config_id += "_";
@@ -250,6 +465,15 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
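
For the quantized path configured above, the float rescale factor input_scale * weights_scale / output_scale is folded into an integer multiplier and a right shift. The sketch below assumes the usual frexp-based, gemmlowp-style decomposition; quantization::calculate_quantized_multiplier_less_than_one remains the authoritative implementation.

#include <cmath>
#include <cstdint>

// Sketch: decompose 0 < multiplier < 1 into a Q0.31 multiplier and a right
// shift, so the kernel can rescale int32 accumulators with integers only.
static void quantize_multiplier_sketch(float multiplier, int32_t *quantized_multiplier, int *right_shift)
{
    int          exponent = 0;
    const double q        = std::frexp(static_cast<double>(multiplier), &exponent); // multiplier = q * 2^exponent, 0.5 <= q < 1
    *right_shift          = -exponent;
    auto q_fixed          = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // rounding may push q up to 1.0
    {
        q_fixed /= 2;
        --(*right_shift);
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
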
 
+Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                const GPUTarget target)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, target).first);
+
+    return Status{};
+}
+
 void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -259,8 +483,8 @@
     Window slice  = window.first_slice_window_3D();
     Window win_in = window;
 
-    win_in.adjust(Window::DimX, -_conv_pad_x, true);
-    win_in.adjust(Window::DimY, -_conv_pad_y, true);
+    win_in.adjust(Window::DimX, -_border_size.left, true);
+    win_in.adjust(Window::DimY, -_border_size.top, true);
     win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
     win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
 
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 2e066c7..66504e6 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -122,6 +122,7 @@
         switch(dt)
         {
             case DataType::U8:
+            case DataType::QASYMM8:
                 set_constant_border<uint8_t>(idx, constant_border_value);
                 break;
             case DataType::QS8:
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 268260b..7741f12 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -43,8 +43,10 @@
 
 void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                  DataType::U16, DataType::S16, DataType::QS16,
+                                                  DataType::U32, DataType::S32,
+                                                  DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     TensorShape output_shape = input->info()->tensor_shape();
@@ -52,7 +54,7 @@
     output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -62,9 +64,8 @@
     _output = output;
 
     // Create kernel
-    std::string data_type_name;
-    data_type_name = support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
-    _kernel        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
+    std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
 
     // Configure kernel window
     const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
@@ -117,7 +118,7 @@
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, _input, in_slice);
         add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice);
+        enqueue(queue, *this, in_slice, _lws_hint);
     }
     while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
 }
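
For context, the 4x4 interleave rearranges each group of four rows so that GEMM can stream them contiguously. The host-side sketch below illustrates the layout only; it is not the OpenCL kernel, and the element type is arbitrary (the real kernel is selected by element size).

#include <vector>

// Sketch: interleave an MxN matrix in groups of 4 rows. For every column x the
// output receives row0[x], row1[x], row2[x], row3[x], giving an output of
// width 4*N and height ceil(M/4).
static std::vector<int> interleave4x4_sketch(const std::vector<int> &in, int M, int N)
{
    std::vector<int> out(((M + 3) / 4) * 4 * N, 0);
    int              dst = 0;
    for(int y0 = 0; y0 < M; y0 += 4)
    {
        for(int x = 0; x < N; ++x)
        {
            for(int r = 0; r < 4; ++r)
            {
                const int y = y0 + r;
                out[dst++]  = (y < M) ? in[y * N + x] : 0;
            }
        }
    }
    return out;
}
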
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index ef572cf..1d9fe4b 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -51,45 +51,85 @@
 {
 }
 
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
-                                               int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    }
 
     _input0 = input0;
     _input1 = input1;
     _output = output;
 
-    // Create kernel and set static arguments
-    std::set<std::string> build_opts = { ("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))) };
-    _kernel                          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_u8", build_opts));
-    unsigned int idx                 = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg<int32_t>(idx++, a_offset);
-    _kernel.setArg<int32_t>(idx++, b_offset);
-    _kernel.setArg<int32_t>(idx++, output_offset);
-    _kernel.setArg<int32_t>(idx++, output_mult_int);
-    _kernel.setArg<int32_t>(idx++, shift);
+    CLBuildOptions build_opts;
 
-    // Configure window
-    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
-    constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+    if(is_interleaved_transposed)
+    {
+        // Create kernel and set static arguments
+        build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options()));
 
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        // Configure window
+        constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+        constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
+        constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
 
-    AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
-    AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    update_window_and_padding(win, input0_access, input1_access, output_access);
+        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
+        AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+        update_window_and_padding(win, input0_access, input1_access, output_access);
 
-    ICLKernel::configure(win);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+        ICLKernel::configure(win);
+    }
+    else
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensors: process 16 elements per iteration along X and up to 4 output rows along Y
+        constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+        const unsigned int     num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+
+        build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options()));
+
+        // Configure window
+        Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowStatic    input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->info()->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+
+        ICLKernel::configure(win);
+    }
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "gemmlowp_";
+    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
 void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -117,7 +157,7 @@
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_2D(slice));
 }
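
The reworked low-precision multiply consumes QASYMM8 inputs and writes raw S32 accumulators; quantization offsets are handled by the offset-contribution and output-stage kernels added below. A reference loop for a single output element (illustrative; A and B are assumed row-major with leading dimensions lda/ldb, names mine):

#include <cstdint>

// Sketch: C[m][n] = sum_k A[m][k] * B[k][n], accumulated in int32 on the raw
// uint8 values; no offsets are applied at this stage.
static int32_t gemmlowp_mm_element_sketch(const uint8_t *A, const uint8_t *B, int K, int m, int n, int lda, int ldb)
{
    int32_t acc = 0;
    for(int k = 0; k < K; ++k)
    {
        acc += static_cast<int32_t>(A[m * lda + k]) * static_cast<int32_t>(B[k * ldb + n]);
    }
    return acc;
}
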
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000..2877a74
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
+
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
+
+        // Validate batches
+        TensorShape output_shape = mm_result->info()->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(2);
+
+            ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches as the output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
+                                         && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                         "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1");
+            }
+        }
+
+        build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    }
+
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, mm_result_access);
+
+    if(a_offset != 0)
+    {
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
+        update_window_and_padding(win, vector_sum_col_access);
+    }
+
+    if(b_offset != 0)
+    {
+        AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
+        update_window_and_padding(win, vector_sum_row_access);
+    }
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _mm_result, slice);
+        if(_vector_sum_col != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col);
+        }
+        if(_vector_sum_row != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
+        }
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
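
The offset contribution applies the standard gemmlowp decomposition of a quantized product. The identity below is the underlying math; how the signs are folded into A_OFFSET, B_OFFSET and K_OFFSET at kernel build time is not spelled out here, so the helper is illustrative only.

#include <cstdint>

// For quantized values with zero points a_offset and b_offset:
//   sum_k (A[m][k] - a_offset) * (B[k][n] - b_offset)
//     =   sum_k A[m][k] * B[k][n]        (raw S32 from the matrix multiply kernel)
//       - b_offset * sum_k A[m][k]       (per-row reduction of A, vector_sum_row)
//       - a_offset * sum_k B[k][n]       (per-column reduction of B, vector_sum_col)
//       + K * a_offset * b_offset        (constant term, K_OFFSET)
static int32_t offset_contribution_sketch(int32_t mm_result, int32_t sum_row_a, int32_t sum_col_b,
                                          int32_t a_offset, int32_t b_offset, int32_t k)
{
    return mm_result - b_offset * sum_row_a - a_offset * sum_col_b + k * a_offset * b_offset;
}
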
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000..ff2fc64
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check biases if they exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, output_result_access);
+
+        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
+                                                                          int result_offset_after_shift, int min, int max)
+{
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(result_offset_after_shift));
+    build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(result_fixedpoint_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
+    build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(_bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, _bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx1, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
\ No newline at end of file
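
A host-side sketch of the fixed-point output stage follows. It assumes the usual gemmlowp-style requantization order (add bias, fixed-point multiply, rounding shift, add offset, clamp) and simplifies the rounding relative to gemmlowp's saturating round-to-nearest; it is not the CL kernel.

#include <algorithm>
#include <cstdint>

// Sketch: requantize an int32 accumulator to QASYMM8 using the fixed-point
// multiplier and shift produced by the decomposition sketched earlier.
static uint8_t quantize_down_fixedpoint_sketch(int32_t acc, int32_t bias, int32_t multiplier, int shift,
                                               int32_t offset_after_shift, int32_t qmin, int32_t qmax)
{
    int64_t v = static_cast<int64_t>(acc) + bias;
    v         = (v * multiplier + (1ll << 30)) >> 31; // simplified rounding doubling high multiply
    if(shift > 0)
    {
        v = (v + (1ll << (shift - 1))) >> shift; // rounding right shift
    }
    v += offset_after_shift;
    v = std::min<int64_t>(std::max<int64_t>(v, qmin), qmax);
    return static_cast<uint8_t>(v);
}
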
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
new file mode 100644
index 0000000..151a658
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check biases if they exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, output_result_access);
+
+        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min,
+                                                              int max)
+{
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(result_offset));
+    build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(result_mult_int));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(result_shift));
+    build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(_bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, _bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx1, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
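
The non fixed-point variant uses only an integer multiplier and shift. A sketch under the same assumptions as above (order of operations inferred from the build options; illustrative only):

#include <algorithm>
#include <cstdint>

// Sketch: add bias and RESULT_OFFSET, scale by RESULT_MULT_INT, shift right by
// RESULT_SHIFT, clamp to the requested bounds and narrow to uint8.
static uint8_t quantize_down_scale_sketch(int32_t acc, int32_t bias, int32_t result_offset,
                                          int32_t result_mult_int, int result_shift, int32_t qmin, int32_t qmax)
{
    int64_t v = static_cast<int64_t>(acc) + bias + result_offset;
    v *= result_mult_int;
    v >>= result_shift;
    v = std::min<int64_t>(std::max<int64_t>(v, qmin), qmax);
    return static_cast<uint8_t>(v);
}
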
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
new file mode 100644
index 0000000..bcf04b0
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
+    : _input(), _output()
+{
+}
+
+void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+    _input  = mtx_a;
+    _output = vector_sum_row;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
+
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
+    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              input_access,
+                              output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+    Window slice_in  = collapsed.first_slice_window_2D();
+    Window slice_out = collapsed.first_slice_window_2D();
+
+    // Set up the input slice; its dimensions are increased in the CL kernel.
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+    }
+    while(collapsed.slide_window_slice_2D(slice_out));
+}
+
+void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+
+    _input  = mtx_b;
+    _output = vector_sum_col;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(0)));
+    build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->info()->dimension(1)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options()));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), num_elems_processed_per_iteration), _input->info()->dimension(1));
+    AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              input_access,
+                              output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+    Window slice_out = collapsed.first_slice_window_2D();
+    Window slice_in  = slice_out;
+
+    slice_in.set(Window::DimY, Window::Dimension(0, 1, 1));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+    }
+    while(collapsed.slide_window_slice_2D(slice_out));
+}
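
The two reduction kernels produce the per-row sums of A and per-column sums of B consumed by the offset contribution. Reference loops (illustrative, row-major storage assumed):

#include <cstdint>
#include <vector>

// Sketch: vector_sum_row[m] = sum over k of A[m][k]  (matrix A reduction)
//         vector_sum_col[n] = sum over k of B[k][n]  (matrix B reduction)
static void gemmlowp_reductions_sketch(const uint8_t *A, const uint8_t *B, int M, int N, int K,
                                       std::vector<int32_t> &sum_row, std::vector<int32_t> &sum_col)
{
    sum_row.assign(M, 0);
    sum_col.assign(N, 0);
    for(int m = 0; m < M; ++m)
    {
        for(int k = 0; k < K; ++k)
        {
            sum_row[m] += A[m * K + k];
        }
    }
    for(int k = 0; k < K; ++k)
    {
        for(int n = 0; n < N; ++n)
        {
            sum_col[n] += B[k * N + n];
        }
    }
}
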
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 263cfab..015b4f7 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -51,18 +51,23 @@
     _biases = biases;
     _accum  = accum;
 
-    std::set<std::string> build_opts;
-    build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type())));
-    if(is_data_type_fixed_point(accum->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(accum->info()->fixed_point_position()));
-    }
+    // Get the target architecture
+    GPUTarget arch_target = get_arch_from_target(get_target());
+    // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+    const unsigned int vector_size = (arch_target == GPUTarget::BIFROST) ? 8 : 16;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+    build_opts.add_option_if(is_data_type_fixed_point(accum->info()->data_type()),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(accum->info()->fixed_point_position()));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 16;
+    const unsigned int num_elems_processed_per_iteration = vector_size;
 
     Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
 
@@ -92,7 +97,7 @@
         add_2D_tensor_argument(idx, _accum, accum_slice);
         add_1D_tensor_argument(idx, _biases, biases_slice);
 
-        enqueue(queue, *this, accum_slice);
+        enqueue(queue, *this, accum_slice, _lws_hint);
     }
     while(window.slide_window_slice_2D(accum_slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index b184c50..16706dd 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -38,7 +38,6 @@
 #include "arm_compute/core/Window.h"
 
 #include <set>
-#include <sstream>
 #include <string>
 
 using namespace arm_compute;
@@ -53,7 +52,6 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
     if(!is_interleaved_transposed)
     {
         ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
@@ -63,7 +61,19 @@
     _input1 = input1;
     _output = output;
 
-    if(output->info()->dimension(1) == 196)
+    const DataType data_type = input0->info()->data_type();
+    const int      fp_pos    = input0->info()->fixed_point_position();
+
+    // Get target architecture
+    GPUTarget arch_target = get_arch_from_target(get_target());
+
+    // Configure LWS hint
+    if(arch_target == GPUTarget::BIFROST && input1->info()->dimension(1) == 24)
+    {
+        // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
+        _lws_hint = cl::NDRange(2, 2);
+    }
+    else if(output->info()->dimension(1) == 196)
     {
         _lws_hint = cl::NDRange(1, 7);
     }
@@ -72,40 +82,35 @@
         _lws_hint = cl::NDRange(8, 8);
     }
 
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))));
-    build_opts.emplace(("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))));
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
 
-    if(is_data_type_fixed_point(input0->info()->data_type()))
-    {
-        build_opts.emplace(("-DALPHA=" + support::cpp11::to_string((input0->info()->data_type() == DataType::QS8 ?
-                                                                    sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
-                                                                    sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position())))));
+    const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f;
 
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input0->info()->fixed_point_position())));
-    }
-    else
+    // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
+    if(multiply_alpha)
     {
-        build_opts.emplace(("-DALPHA=" + float_to_string_with_full_precision(alpha)));
+        build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
+                                      "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
+                                      "-DALPHA=" + float_to_string_with_full_precision(alpha));
     }
 
+    std::string kernel_name;
     if(is_interleaved_transposed)
     {
-        // Create kernel
-        std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
-
-        if(data_type_name == "f32")
+        build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        if(data_type == DataType::F32)
         {
-            GPUTarget arch_target = get_arch_from_target(get_target());
-            _kernel               = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target), build_opts));
+            kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
         }
         else
         {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_" + data_type_name, build_opts));
+            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
         }
 
-        // Configure window kernel
-        const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+        // Configure kernel window
+        const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
         constexpr unsigned int num_elems_processed_per_iteration_y = 4;
 
         Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
@@ -122,28 +127,47 @@
     }
     else // The input tensors have not been reshaped
     {
-        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+        build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
 
-        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
-        const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+        unsigned int       num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
         const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
 
-        build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type())));
-        build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)));
-        build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)));
-
-        // Create kernel
-        if(is_data_type_fixed_point(input0->info()->data_type()))
+        // Create kernels according to the architecture, data type and input size.
+        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
         {
-            std::string kernel_name = "gemm_mm_" + lower_string(string_from_data_type(input0->info()->data_type()));
-            _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+            // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+            // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+            // FC6 and FC7 of AlexNet and VGG-16).
+            if(input1->info()->dimension(0) <= 1000)
+            {
+                // Each work-item processes 2 elements in the X dimension.
+                num_elems_processed_per_iteration_x = 2;
+                kernel_name                         = "gemm_mm_floating_point_f32_bifrost_1000";
+            }
+            else
+            {
+                // Each work-item processes 4 elements in the X dimension (as in the default case).
+                num_elems_processed_per_iteration_x = 4;
+                kernel_name                         = "gemm_mm_floating_point_f32_bifrost";
+            }
+            // A work-group size equal to the Bifrost quad size has been found to be optimal for these kernels
+            // via exhaustive autotuning over a range of representative layer configurations.
+            _lws_hint = cl::NDRange(4);
         }
-        else
+        else if(is_data_type_fixed_point(data_type))
         {
-            std::string kernel_name = "gemm_mm_floating_point";
-            _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts));
+            kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type));
         }
+        else // (MIDGARD and F32) or (F16)
+        {
+            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+            kernel_name = "gemm_mm_floating_point";
+        }
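+        // Pass the number of output elements each work-item computes in X and Y to the kernel.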
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+        build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
 
+        // Configure window
         Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowStatic    input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
@@ -157,18 +181,21 @@
         output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
 
         ICLKernel::configure(win);
-
-        // Set config_id for enabling LWS tuning
-        _config_id = "gemm_";
-        _config_id += (is_interleaved_transposed ? "reshaped_" : "");
-        _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
-        _config_id += "_";
-        _config_id += support::cpp11::to_string(output->info()->dimension(1));
-        _config_id += "_";
-        _config_id += support::cpp11::to_string(output->info()->dimension(0));
-        _config_id += "_";
-        _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
     }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "gemm_";
+    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
 void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 70af5d6..951bc14 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -63,6 +63,14 @@
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mv", build_opts));
 
+    // Configure the local work size for Bifrost with a value obtained
+    // via exhaustive autotuning for the MobileNets tensor shapes.
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        _lws_hint = cl::NDRange(1, 1, 1);
+    }
+
     // Configure kernel window
     const unsigned int num_elems_read_per_iteration = 4;
 
@@ -119,7 +127,7 @@
         unsigned int idx_2 = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
         add_3D_tensor_argument(idx_0, _input0, slice_in);
         add_1D_tensor_argument(idx_2, _output, slice_out);
-        enqueue(queue, *this, slice_in);
+        enqueue(queue, *this, slice_in, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
 }
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5057c8f..35074f9 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -40,8 +40,9 @@
 
 void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                  DataType::U16, DataType::S16, DataType::QS16,
+                                                  DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
     TensorShape  output_shape{ input->info()->tensor_shape() };
@@ -50,7 +51,7 @@
     output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index 87ee5fb..fa39ce6 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -115,18 +115,19 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    if(_input->info()->dimension(0) < pixels_per_item)
-    {
-        return;
-    }
-
     _output->map(queue, true);
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
     memset(_output->buffer(), 0, _output->size());
     _output->unmap(queue);
 
-    Window      slice = window.first_slice_window_2D();
-    cl::NDRange lws   = cl::NDRange(local_x_size, 1);
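+    // The output distribution has already been zeroed above, so it remains valid even when the input is too narrow to run the kernel.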
+    if(_input->info()->dimension(0) < pixels_per_item)
+    {
+        return;
+    }
+
+    Window             slice = window.first_slice_window_2D();
+    const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step();
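+    // Fall back to a 1x1 local size when the default local size does not fit within the global work size in X.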
+    cl::NDRange        lws   = (local_x_size < gws_x) ? cl::NDRange(local_x_size, 1) : cl::NDRange(1, 1);
 
     do
     {
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 98a799f..6514d6c 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -46,36 +46,34 @@
 
 void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
     _input  = input;
     _output = output;
 
+    const DataType  data_type  = input->info()->data_type();
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+
     // Create kernel
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace((has_bias ? "-DHAS_BIAS" : ""));
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+    build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+    build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
-
-    int pad_x    = 0;
-    int pad_y    = 0;
     int stride_x = 0;
     int stride_y = 0;
-    std::tie(pad_x, pad_y)       = conv_info.pad();
+
     std::tie(stride_x, stride_y) = conv_info.stride();
 
     const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
                                      && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                     input->info()->tensor_shape().cend(),
                                                     output->info()->tensor_shape().cbegin() + 1))
-                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
 
+    std::string kernel_name = "im2col_generic";
     if(!run_img2col_reduced)
     {
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
@@ -83,35 +81,87 @@
                                             conv_info);
         _num_elems_processed_per_iteration = output->info()->dimension(0);
 
-        build_opts.emplace("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
-        build_opts.emplace("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
-        build_opts.emplace("-DKERNEL_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
-        build_opts.emplace("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
-        build_opts.emplace("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
-        build_opts.emplace("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
-        build_opts.emplace("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
-        build_opts.emplace("-DPAD_X=" + support::cpp11::to_string(conv_info.pad().first));
-        build_opts.emplace("-DPAD_Y=" + support::cpp11::to_string(conv_info.pad().second));
-        build_opts.emplace("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-        build_opts.emplace("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+        build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+        build_opts.add_option("-DKERNEL_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+        build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(_convolved_dims.first));
+        build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(_convolved_dims.second));
+        build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+        build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+        build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+        build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+        build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+        build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+        build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
 
-        if(kernel_dims.width == 3 && kernel_dims.height == 3 && conv_info.pad().first == 0 && conv_info.pad().second == 0)
+        if(kernel_dims.width == 3 && kernel_dims.height == 3 && !conv_info.has_padding())
         {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_kernel3x3_padx0_pady0", build_opts));
+            kernel_name = "im2col_kernel3x3_padx0_pady0";
+
+            // Local work size optimized for the 3x3 MobileNets convolution on Bifrost.
+            if(gpu_target == GPUTarget::BIFROST && input->info()->dimension(0) == 224)
+            {
+                _lws_hint = cl::NDRange(2, 3, 3);
+            }
+        }
+        else if(kernel_dims.width > 1 && !conv_info.has_padding())
+        {
+            kernel_name = "im2col_generic_padx0_pady0";
+
+            // Optimized im2col is performed using one or more vector operations with the specified vector size
+            // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+            // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+            // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+            // Using the vector size of 8, however, may be faster.
+            size_t vector_size = 4;
+            // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+            // is used instead.)
+            if(kernel_dims.width < vector_size)
+            {
+                vector_size = kernel_dims.width;
+            }
+            // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+            if(gpu_target == GPUTarget::BIFROST && kernel_dims.width == 11)
+            {
+                _lws_hint   = cl::NDRange(1, 1, 1);
+                vector_size = 8;
+            }
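+            // For example, an 11-wide kernel with vector size 8 leaves a remainder of 3 columns to handle with a smaller vector.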
+            const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+            build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+            build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
         }
         else
         {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+            if(gpu_target == GPUTarget::BIFROST)
+            {
+                const size_t input_channels = input->info()->dimension(2);
+                if((input_channels & (input_channels - 1)) == 0)
+                {
+                    // input_channels is a power of two
+                    _lws_hint = cl::NDRange(1, 1, 4);
+                }
+                else if(input_channels < 192 && (input_channels % 4) == 0)
+                {
+                    // input_channels is less than 192 and is a multiple of 4
+                    _lws_hint = cl::NDRange(1, 1, 2);
+                }
+                // otherwise the default is optimal
+            }
         }
         _run_func = &CLIm2ColKernel::run_generic;
     }
     else
     {
+        kernel_name                        = "im2col_reduced";
         _num_elems_processed_per_iteration = 1;
-        _kernel                            = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
         _run_func                          = &CLIm2ColKernel::run_reduced;
     }
 
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
     // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -174,7 +224,6 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice_in);
         add_2D_tensor_argument(idx, _output, slice_out);
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
         enqueue(queue, *this, slice, _lws_hint);
@@ -203,7 +252,7 @@
 
         _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
         _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
-        enqueue(queue, *this, in_slice);
+        enqueue(queue, *this, in_slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
 }
diff --git a/src/core/CL/kernels/CLL2NormalizeKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
similarity index 92%
rename from src/core/CL/kernels/CLL2NormalizeKernel.cpp
rename to src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 3e0758c..36e351e 100644
--- a/src/core/CL/kernels/CLL2NormalizeKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
@@ -37,12 +37,12 @@
 
 using namespace arm_compute;
 
-CLL2NormalizeKernel::CLL2NormalizeKernel()
+CLL2NormalizeLayerKernel::CLL2NormalizeLayerKernel()
     : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
 {
 }
 
-void CLL2NormalizeKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, unsigned int axis, float epsilon)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
@@ -87,7 +87,7 @@
     ICLKernel::configure(win);
 }
 
-void CLL2NormalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
index 95334c7..3b9fb1f 100644
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -53,7 +53,7 @@
     constexpr unsigned int num_elems_written_per_iteration   = 8;
     constexpr unsigned int num_rows_read_per_iteration       = 3;
 
-    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
 
     AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index a744739..df2104a 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -35,6 +35,62 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+
+    if(is_data_type_fixed_point(input->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    const unsigned int norm_size = norm_info.norm_size();
+    bool               is_in_map = norm_info.is_in_map();
+
+    const unsigned int border_width = is_in_map ? std::min(norm_size / 2, 3U) : 0;
+    const BorderSize   border_size  = BorderSize(0, border_width);
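+    // Only in-map normalization reads neighbouring elements along X, so only it requires a left/right border.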
+
+    const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->data_type())) ? 16 : 4;
+    const unsigned int num_elems_read_per_iteration      = is_in_map ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration;
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
+    AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLNormalizationLayerKernel::CLNormalizationLayerKernel()
     : _input(nullptr), _output(nullptr), _border_size(0), _is_in_map(false)
 {
@@ -47,63 +103,65 @@
 
 void CLNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), *input->info()->clone());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
-    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
-    }
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
 
     _input  = input;
     _output = output;
 
-    _is_in_map                      = (norm_info.type() != NormType::CROSS_MAP);
+    _is_in_map                      = norm_info.is_in_map();
     const unsigned int border_width = _is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
     _border_size                    = BorderSize(0, border_width);
 
     const unsigned int num_elems_processed_per_iteration = (is_data_type_fixed_point(input->info()->data_type())) ? 16 : 4;
-    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const bool         is_in_map_2D                      = (norm_info.type() == NormType::IN_MAP_2D);
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
-    build_opts.emplace(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
-    build_opts.emplace(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
-    build_opts.emplace(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.emplace(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
-    build_opts.emplace(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option(("-DCOEFF=" + float_to_string_with_full_precision(norm_info.scale_coeff())));
+    build_opts.add_option(("-DBETA=" + float_to_string_with_full_precision(norm_info.beta())));
+    build_opts.add_option(("-DKAPPA=" + float_to_string_with_full_precision(norm_info.kappa())));
+    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
+    build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
+    build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
 
     // Create kernel
-    std::string kernel_name = (norm_info.type() == NormType::IN_MAP_1D) ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    std::string kernel_name = _is_in_map ? "normalization_layer_in_map" : "normalization_layer_cross_map";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info(), norm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
-    AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    // Set config_id for enabling LWS tuning
+    _config_id = "normalization_layer_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(static_cast<std::underlying_type<NormType>::type>(norm_info.type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(norm_info.norm_size());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
 
-    update_window_and_padding(win, input_access, output_access);
+Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
 
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 33c8b81..fd5e5d5 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -40,6 +40,65 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    ARM_COMPUTE_UNUSED(overflow_policy);
+    ARM_COMPUTE_UNUSED(rounding_policy);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+
+    if(is_data_type_fixed_point(input1->data_type()))
+    {
+        // All data types must be all QS8 or all QS16
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
+    }
+
+    // Validate in case of configured output
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+                                        "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output);
+        if(is_data_type_fixed_point(input1->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
+        }
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
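+    // The output is only valid where both inputs are valid.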
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
     : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
@@ -64,20 +123,8 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
-    if(is_data_type_fixed_point(input1->info()->data_type()))
-    {
-        // All data types must be all QS8 or all QS16
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output);
-        ARM_COMPUTE_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1.");
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
+                                                  scale, overflow_policy, rounding_policy));
 
     _input1 = input1;
     _input2 = input2;
@@ -161,21 +208,18 @@
     }
 
     // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+Status CLPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                                                 ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input1_access, input2_access, output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-    output_access.set_valid_region(win, valid_region);
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 497e87b..ac368c7 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
@@ -40,6 +41,130 @@
 
 using namespace arm_compute;
 
+namespace
+{
+// Internal window config info
+using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
+
+void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
+{
+    TensorShape output_shape{ input->tensor_shape() };
+    output_shape.set(0, pooled_w);
+    output_shape.set(1, pooled_h);
+
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
+                                    "L2 pooling is not supported on QASYMM8 inputs!");
+
+    const bool         is_global_pooling = pool_info.is_global_pooling();
+    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
+                                    "Global pooling is supported only with square inputs!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
+                                    "Invalid pool size and pool pad combination!");
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        unsigned int pooled_w = 0;
+        unsigned int pooled_h = 0;
+        std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+                                                         input->dimension(1),
+                                                         pool_size,
+                                                         pool_size,
+                                                         pool_info.pad_stride_info());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
+                                        "Invalid output pooling dimensions!");
+    }
+
+    return Status{};
+}
+
+std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    int                 pool_pad_x      = 0;
+    int                 pool_pad_y      = 0;
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    unsigned int        pooled_w        = 0;
+    unsigned int        pooled_h        = 0;
+    int                 pool_size       = pool_info.pool_size();
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Update pool size in case of global pooling
+    pool_size = pool_info.is_global_pooling() ? input->dimension(0) : pool_size;
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+                                                     input->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pad_stride_info);
+
+    auto_init(input, output, pooled_w, pooled_h);
+
+    BorderSize     border_size = BorderSize(pool_pad_y, pool_pad_x);
+    const DataType data_type   = input->data_type();
+
+    const int input_width  = input->dimension(0);
+    const int input_height = input->dimension(1);
+
+    unsigned int num_elems_processed_per_iteration = 1;
+
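+    // The right and bottom borders must cover any elements the last pooling window reads beyond the input edge.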
+    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
+    {
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+
+        int num_elems_read_per_iteration = pool_size;
+        if(is_pool3x3_stride_le3)
+        {
+            // Change the number of elements processed and the number of elements read per iteration
+            // for 3x3 pooling with a stride less than or equal to 3
+            num_elems_processed_per_iteration = 4;
+            num_elems_read_per_iteration      = pool_size * (pool_stride_x + 1);
+        }
+
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+    }
+    else
+    {
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+    }
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle  input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_tuple(err, win, CLPoolingConfig(num_elems_processed_per_iteration, border_size));
+}
+} // namespace
+
 CLPoolingLayerKernel::CLPoolingLayerKernel()
     : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
 {
@@ -59,120 +184,109 @@
     unsigned int        pooled_w        = 0;
     unsigned int        pooled_h        = 0;
     const PoolingType   pool_type       = pool_info.pool_type();
-    const int           pool_size       = pool_info.pool_size();
+    int                 pool_size       = pool_info.pool_size();
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+    const bool          exclude_padding = pool_info.exclude_padding();
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
-    ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type()));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Update pool size in case of global pooling
+    pool_size = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_size;
 
     // Check output dimensions
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
                                                      input->info()->dimension(1),
                                                      pool_size,
                                                      pool_size,
-                                                     pool_info.pad_stride_info());
+                                                     pad_stride_info);
 
-    // Output auto initialization if not yet initialized
-    {
-        TensorShape output_shape{ input->info()->tensor_shape() };
-        output_shape.set(0, pooled_w);
-        output_shape.set(1, pooled_h);
+    auto_init(input->info(), output->info(), pooled_w, pooled_h);
 
-        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
-
-    const int input_width  = input->info()->dimension(0);
-    const int input_height = input->info()->dimension(1);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
 
     // Set instance variables
-    _input       = input;
-    _output      = output;
-    _pool_info   = pool_info;
-    _border_size = BorderSize(pool_pad_y, pool_pad_x);
+    _input     = input;
+    _output    = output;
+    _pool_info = pool_info;
+
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const DataType  data_type  = input->info()->data_type();
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
-
-    build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
     if(pool_type != PoolingType::MAX)
     {
-        build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
-        build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
-        build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
-        build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
-        build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+        build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)));
+        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)));
+        build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x));
+        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y));
     }
 
     // Create kernel
-    if(pool_size <= 7)
+    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
     {
         // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where
         // each thread computes 4 output elements
-        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
 
-        int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
-        if(is_pool3x3_stride_le3)
-        {
-            // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3
-            _num_elems_processed_per_iteration = 4;
-            num_elements_read_per_iteration    = pool_size * (pool_stride_x + 1);
-        }
-
-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
-        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
-        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
-        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
-
-        std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
-        if(is_pool3x3_stride_le3)
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
-        }
-        else
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-        }
+        std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+                                  + support::cpp11::to_string(pool_size);
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
     else // Run general case
     {
-        _num_elems_processed_per_iteration = 1;
+        build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size));
+        build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
 
-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
-        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
-        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
-        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
-
-        build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
-        if(input->info()->data_type() == DataType::F16)
-        {
-            build_opts.emplace("-DFP16");
-        }
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
+        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N";
+        _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
-    AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    ICLKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
+
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    // Configure the local work size (hint) from the first two dimensions of the global work size.
+    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
+    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
+    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config));
+        _lws_hint       = cl::NDRange(gws[0], gws[1], 1);
+    }
+
+    ICLKernel::configure(std::get<1>(win_config));
+
+    CLPoolingConfig pooling_config     = std::get<2>(win_config);
+    _num_elems_processed_per_iteration = pooling_config.first;
+    _border_size                       = pooling_config.second;
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "pooling_layer_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+}
+
+Status CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
+
+    return Status{};
 }
 
 void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -198,7 +312,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, in_slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
 }
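
// --- Editorial illustration, not part of the patch ----------------------------
// The main structural change in this kernel is the static validate() entry
// point added above, which runs the same argument and window checks as
// configure() but on cloned ITensorInfo objects, so it can be called before any
// OpenCL resources exist. Rough usage sketch; the PoolingLayerInfo and
// PadStrideInfo constructor calls below are assumptions based on the rest of
// the library, not something this patch defines.
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

arm_compute::Status check_pooling_config()
{
    using namespace arm_compute;

    const TensorInfo       src(TensorShape(16U, 16U, 3U), 1, DataType::F32);
    const TensorInfo       dst(TensorShape(8U, 8U, 3U), 1, DataType::F32);
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0));

    // Reports whether configure() would succeed for these shapes, without
    // creating any tensor memory or kernel objects.
    return CLPoolingLayerKernel::validate(&src, &dst, pool_info);
}
// ------------------------------------------------------------------------------
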
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
index e63a5ef..b46bb30 100644
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -54,6 +54,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
+    ARM_COMPUTE_UNUSED(border_undefined);
 
     _input  = input;
     _output = output;
@@ -69,12 +70,14 @@
 
     // Configure window
     constexpr unsigned int num_elems_processed_per_iteration = 4;
-    const int              border_offset                     = (border_undefined) ? 0 : border_size().left;
+
+    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
 
     Window             win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic input_access(output->info(), -border_offset, -border_offset,
-                                    _output->info()->dimension(0) + border_offset, _output->info()->dimension(1) + border_offset);
-    AccessWindowHorizontal output_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, output_access);
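
// --- Editorial illustration, not part of the patch ----------------------------
// The right-hand access bound above rounds the input width up to the processing
// step and only adds the right border when the width is already a multiple of
// the step; when it is not, the round-up itself provides the extra elements on
// the right. Stand-alone rendering of the same arithmetic (helper names here
// are ours, not the library's):
int ceil_to_multiple_of(int value, int multiple)
{
    return ((value + multiple - 1) / multiple) * multiple;
}

int remap_access_right(int input_width, int step, int border_right)
{
    const int total_right = ceil_to_multiple_of(input_width, step);
    return total_right + ((total_right == input_width) ? border_right : 0);
}
// remap_access_right(640, 4, 1) == 641  (width already a multiple of the step)
// remap_access_right(642, 4, 1) == 644  (rounding to 644 already covers the border)
// ------------------------------------------------------------------------------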
 
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index 0131bd3..95f980f 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -46,11 +46,13 @@
 
 void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                  DataType::U16, DataType::S16, DataType::QS16,
                                                   DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
 
     _input  = input;
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 82ebe64..673304a 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -44,7 +44,7 @@
     return BorderSize(1);
 }
 
-void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
@@ -54,11 +54,14 @@
     _input  = input;
     _output = output;
 
-    /* Compute the ratio between source width/height and destination width/height */
+    // Compute the ratio between source width/height and destination width/height
     const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
     const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
 
-    /* Area interpolation behaves as Nearest Neighbour in case of up-sampling */
+    // Compute actual border size
+    BorderSize border = border_undefined ? BorderSize(0) : border_size();
+
+    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
     {
         policy = InterpolationPolicy::NEAREST_NEIGHBOR;
@@ -69,11 +72,15 @@
     }
 
     // Create kernel
-    std::set<std::string> build_opts         = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
-    std::string           interpolation_name = string_from_interpolation_policy(policy);
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
+    build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+
+    std::string interpolation_name = string_from_interpolation_policy(policy);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 4;
@@ -84,15 +91,18 @@
 
     // Reads can occur within the valid region of the input
     AccessWindowStatic input_access(input->info(),
-                                    input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
-                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
-                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
+                                    input_valid_region.anchor[0] - border.left, input_valid_region.anchor[1] - border.top,
+                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border.right,
+                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border.bottom);
 
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, output_access);
 
-    output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()), output->info()->tensor_shape(), policy, border_size(),
+    output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
+                                                                     output->info()->tensor_shape(),
+                                                                     policy,
+                                                                     border,
                                                                      border_undefined));
 
     ICLKernel::configure(win);
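
// --- Editorial illustration, not part of the patch ----------------------------
// The scale kernel derives wr = in_w / out_w and hr = in_h / out_h; when both
// ratios are <= 1 the operation is an up-scale, where AREA interpolation
// degenerates to nearest neighbour, so the policy is swapped before the kernel
// name is built. Minimal stand-alone rendering of that decision:
enum class Interpolation { NEAREST_NEIGHBOR, BILINEAR, AREA };

Interpolation effective_scale_policy(Interpolation requested, float in_w, float in_h, float out_w, float out_h)
{
    const float wr = in_w / out_w;
    const float hr = in_h / out_h;

    // Area interpolation behaves as nearest neighbour when up-sampling
    if(requested == Interpolation::AREA && wr <= 1.f && hr <= 1.f)
    {
        return Interpolation::NEAREST_NEIGHBOR;
    }
    return requested;
}
// e.g. 64x64 -> 128x128 with AREA ends up using the nearest-neighbour scale kernel.
// ------------------------------------------------------------------------------
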
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index da3b942..04a7639 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -33,71 +33,333 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
 #include <set>
 #include <string>
 
 using namespace arm_compute;
 
+namespace
+{
+/** Calculates softmax parameters from the quantized input scale and the scaling factor for the exponent and returns them as build options.
+ *
+ * Prepares these build options:
+ * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of the beta multiplier.
+ * -DIFF_MIN - threshold difference between the maximum value of the input data and the currently processed value;
+ *             it defines whether the value will be taken into account or not.
+ *
+ * @param[in] input_scale Input scaling factor
+ * @param[in] beta        Exponent scaling factor beta
+ *
+ * @return Build options with the softmax parameters set
+ */
+CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta)
+{
+    // Number of integer bits in temporary fixed-point representation of current-to-max difference
+    static const int scaled_diff_int_bits = 5;
+    // Number of integer bits used in temporary fixed-point representation of exponent accumulator
+    static const int exp_accumulation_in_bits = 12;
+
+    const double beta_multiplier = std::min(
+                                       1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
+                                       (1ll << 31) - 1.0);
+    int input_beta_multiplier, input_beta_left_shift;
+    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
+
+    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << input_beta_left_shift);
+    const int    diff_min           = -1.f * std::floor(max_input_rescaled);
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
+    build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits));
+    build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier));
+    build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift));
+    build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min));
+
+    return build_opts;
+}
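
// --- Editorial illustration, not part of the patch ----------------------------
// The helper above encodes two things for the quantized kernels: (a) the real
// factor beta * input_scale * 2^(31 - SCALED_DIFF_INT_BITS), split into a 32-bit
// fixed-point multiplier plus a left shift, and (b) DIFF_MIN, the most negative
// (value - row_max) difference that still contributes to the exponent sum.
// Stand-alone rendering of the same arithmetic; the frexp-based split is an
// assumption about what calculate_quantized_multiplier_greater_than_one does,
// not a copy of it.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantizedSoftmaxParams
{
    int32_t multiplier;
    int     left_shift;
    int     diff_min;
};

QuantizedSoftmaxParams quantized_softmax_params(float input_scale, float beta)
{
    const int scaled_diff_int_bits = 5;

    const double beta_multiplier = std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
                                            (1ll << 31) - 1.0);

    // Split beta_multiplier (>= 1) into q * 2^shift with q in [0.5, 1), then store q in Q0.31
    int          shift   = 0;
    const double q       = std::frexp(beta_multiplier, &shift);
    auto         q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31))
    {
        q_fixed /= 2;
        ++shift;
    }

    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1ll << (31 - scaled_diff_int_bits)) / (1ll << shift);

    return { static_cast<int32_t>(q_fixed), shift, -static_cast<int>(std::floor(max_input_rescaled)) };
}
// For input_scale = 1/256 and beta = 1: multiplier = 2^30, left_shift = 19, diff_min = -3968.
// ------------------------------------------------------------------------------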
+
+// Arguments Validation
+
+Status validate_arguments_1DMax(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        // Softmax across the x dimension
+        TensorShape output_shape{ input->tensor_shape() };
+        output_shape.set(0, 1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+Status validate_arguments_1DShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
+
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        if(is_quantized_asymmetric)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    }
+
+    // Checks performed when sum is configured
+    if(sum->total_size() != 0)
+    {
+        if(is_quantized_asymmetric)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
+    }
+
+    return Status{};
+}
+
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(max, sum, output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    }
+
+    // Checks performed when sum is configured
+    if(sum->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(max, sum);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(max, sum);
+    }
+
+    return Status{};
+}
+
+Status validate_arguments_1DNorm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
+
+    // Note: output should always have a scale of 1/256 and offset 0
+    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+    const bool             is_quantized_asymmetric   = (input->data_type() == DataType::S32);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+        if(!is_quantized_asymmetric)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+            ARM_COMPUTE_RETURN_ERROR_ON(output->quantization_info() != allowed_quantization_info);
+        }
+    }
+
+    return Status{};
+}
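
// --- Editorial illustration, not part of the patch ----------------------------
// The fixed QuantizationInfo(1/256, 0) above reflects the fact that softmax
// outputs lie in [0, 1]: with scale 1/256 and zero offset, a uint8 value v
// represents v / 256, so the representable range [0, 255/256] matches the
// output almost exactly. Stand-alone illustration of that mapping:
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantize_softmax_output(float prob) // prob expected in [0, 1]
{
    const float scale = 1.0f / 256.0f;      // offset is 0
    const int   q     = static_cast<int>(std::round(prob / scale));
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}
// quantize_softmax_output(0.5f) == 128; quantize_softmax_output(1.0f) saturates to 255.
// ------------------------------------------------------------------------------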
+
+// Window validation
+
+std::pair<Status, Window> validate_and_configure_window_1DMax(ITensorInfo *input, ITensorInfo *output)
+{
+    TensorShape output_shape{ input->tensor_shape() };
+    output_shape.set(0, 1);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+    // The kernel loops over all elements in steps of 16
+    const unsigned int     num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
+    constexpr unsigned int num_elems_written_per_iteration   = 1;
+
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_1DShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
+{
+    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->data_type());
+    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->data_type();
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum, max->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->fixed_point_position()));
+    auto_init_if_empty(*output, input->clone()->set_data_type(tmp_data_type));
+
+    // The kernel loops over all elements in steps of 16
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->dimension(0), 16);
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal max_access(max, 0, 1);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum, 0, 1);
+
+    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_1DMaxShiftExpSum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
+{
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum, input->clone()->set_tensor_shape(max->tensor_shape()));
+    auto_init_if_empty(*output, *input->clone());
+
+    CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo parallel_reduction_info = CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(input->dimension(0));
+    unsigned int                                          vector_size             = std::get<1>(parallel_reduction_info);
+    const unsigned int                                    num_elems_x             = ceil_to_multiple(input->tensor_shape().x(), vector_size);
+    Window                                                win                     = calculate_max_window(*input, Steps(num_elems_x));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_x);
+    AccessWindowHorizontal max_access(max, 0, 1);
+    AccessWindowHorizontal output_access(output, 0, num_elems_x);
+    AccessWindowHorizontal sum_access(sum, 0, 1);
+
+    bool window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_1DNorm(ITensorInfo *input, ITensorInfo *output, ITensorInfo *sum)
+{
+    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+    const bool             is_quantized_asymmetric   = (input->data_type() == DataType::S32);
+    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->data_type();
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output,
+                       input->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     sum_access(sum, 0, 0, 1, sum->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
+
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+} // namespace
+
 void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    // Softmax across the x dimension
     TensorShape output_shape{ input->info()->tensor_shape() };
     output_shape.set(0, 1);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMax(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    // The kernel loops over all elements in steps of 16
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
+    const DataType data_type = input->info()->data_type();
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
-    else if(input->info()->data_type() == DataType::F16)
-    {
-        build_opts.emplace("-DUSE_F16");
-    }
-
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(data_type == DataType::F16, "-DUSE_F16");
     // Tell the kernel that the width is not a multiple of 16
-    if((input->info()->dimension(0) % max_cl_vector_width) != 0)
-    {
-        build_opts.emplace("-DNON_MULTIPLE_OF_16");
-    }
+    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, "-DNON_MULTIPLE_OF_16");
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+    std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "softmax_layer_max_quantized" : "softmax_layer_max";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set fixed arguments
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
     _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
 
     // Configure kernel window
-    constexpr unsigned int num_elems_written_per_iteration = 1;
+    auto win_config = validate_and_configure_window_1DMax(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+    // Set config_id for enabling LWS tuning
+    _config_id = "softmax_layer_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
 
-    update_window_and_padding(win, input_access, output_access);
+Status CLLogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMax(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMax(input->clone().get(), output->clone().get()).first);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
@@ -105,67 +367,61 @@
 {
 }
 
-void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
+
+    const bool     is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const DataType tmp_data_type           = is_quantized_asymmetric ? DataType::S32 : input->info()->data_type();
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*sum->info(), max->info()->clone()->set_data_type(tmp_data_type).set_fixed_point_position(input->info()->fixed_point_position()));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(tmp_data_type));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
 
     _input  = input;
     _max    = max;
     _output = output;
     _sum    = sum;
 
-    // The kernel loops over all elements in steps of 16
-    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 16);
+    const DataType dt       = input->info()->data_type();
+    auto           beta_int = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
-    else if(input->info()->data_type() == DataType::F16)
-    {
-        build_opts.emplace("-DUSE_F16");
-    }
-
+    CLBuildOptions build_opts;
+    build_opts.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.add_option_if(is_data_type_fixed_point(dt),
+                             std::string("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
+    build_opts.add_option_if(dt == DataType::F16, std::string("-DUSE_F16"));
     // Tell the kernel that the width is not a multiple of 16
-    if((input->info()->dimension(0) % max_cl_vector_width) != 0)
-    {
-        build_opts.emplace("-DNON_MULTIPLE_OF_16");
-    }
+    build_opts.add_option_if((input->info()->dimension(0) % max_cl_vector_width) != 0, std::string("-DNON_MULTIPLE_OF_16"));
+    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), std::string("-DBETA=" + support::cpp11::to_string(beta_int)));
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), std::string("-DBETA=" + float_to_string_with_full_precision(beta)));
+    build_opts.add_options_if(is_quantized_asymmetric,
+                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_shift_exp_sum_quantized" : "softmax_layer_shift_exp_sum";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set fixed arguments
     unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
     _kernel.setArg<cl_uint>(idx++, input->info()->dimension(0));
 
     // Configure window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window_1DShiftExpSum(input->info(), max->info(), output->info(), sum->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal max_access(max->info(), 0, 1);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+Status CLLogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DShiftExpSum(input, max, output, sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
 
-    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -184,7 +440,132 @@
         add_3D_tensor_argument(idx, _max, slice);
         add_3D_tensor_argument(idx, _output, slice);
         add_3D_tensor_argument(idx, _sum, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
+
+/** Grid size (obtained through auto-tuning) */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_grid_size = 64;
+/** Vector size in the serial case (obtained through auto-tuning) */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
+/** Vector size in the parallel case (obtained through auto-tuning; enables the best memory access pattern for Bifrost) */
+const unsigned int CLLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
+
+CLLogits1DMaxShiftExpSumKernel::CLLogits1DMaxShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void CLLogits1DMaxShiftExpSumKernel::configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum->info(), input->info()->clone()->set_tensor_shape(max->info()->tensor_shape()));
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info()));
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    const DataType dt                 = input->info()->data_type();
+    const size_t   reduction_dim_size = input->info()->dimension(0);
+    auto           beta_int           = static_cast<int>(lround(beta * (1 << input->info()->fixed_point_position())));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option_if(is_data_type_fixed_point(dt),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option_if(dt == DataType::F16, "-DUSE_F16");
+    build_opts.add_option_if(is_data_type_fixed_point(dt) && (beta != 1.0f), "-DBETA=" + support::cpp11::to_string(beta_int));
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+
+    _lws_hint                                     = cl::NullRange;
+    std::string           kernel_name             = std::string("softmax_layer_max_shift_exp_sum_serial");
+    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
+    unsigned int          vector_size             = std::get<1>(parallel_reduction_info);
+
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
+    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
+
+    // Configure parallel kernel if needed
+    if(std::get<0>(parallel_reduction_info))
+    {
+        kernel_name            = std::string("softmax_layer_max_shift_exp_sum_parallel");
+        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
+        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
+
+        // Handle boundary conditions.
+        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
+        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
+        // Setting _lws_hint in this way can also communicate grid_size to CLLogits1DMaxShiftExpSumKernel::run().
+        // A single workgroup performs reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
+        _lws_hint = cl::NDRange(_grid_size);
+    }
+
+    // Create kernel.
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments. Both kernels use the same arguments
+    unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_uint>(idx++, reduction_dim_size);
+
+    // Configure window
+    auto win_config = validate_and_configure_window_1DMaxShiftExpSum(input->info(), max->info(), output->info(), sum->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
+
+Status CLLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(input, max, output, sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DMaxShiftExpSum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
+
+    return Status{};
+}
+
+CLLogits1DMaxShiftExpSumKernel::ParallelReductionInfo CLLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
+{
+    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
+    unsigned int vector_size           = is_parallel_reduction ? _parallel_vector_size : _serial_vector_size;
+    return std::make_tuple(is_parallel_reduction, vector_size);
+}
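
// --- Editorial illustration, not part of the patch ----------------------------
// With the tuned constants above (_grid_size = 64, _serial_vector_size = 8,
// _parallel_vector_size = 4), the parallel path is taken once the softmax row
// holds at least 64 * 8 = 512 elements; shorter rows are reduced by a single
// work-item reading 8 elements per step. Stand-alone rendering of that rule:
#include <cstddef>
#include <tuple>

std::tuple<bool, unsigned int> parallel_reduction_choice(std::size_t row_size)
{
    const unsigned int grid_size            = 64;
    const unsigned int serial_vector_size   = 8;
    const unsigned int parallel_vector_size = 4;

    const bool parallel = (row_size >= static_cast<std::size_t>(grid_size) * serial_vector_size) && (grid_size > 1);
    return std::make_tuple(parallel, parallel ? parallel_vector_size : serial_vector_size);
}
// parallel_reduction_choice(1000) -> {true, 4}:  64 work-items cooperate on each row.
// parallel_reduction_choice(300)  -> {false, 8}: one work-item per row, vector size 8.
// ------------------------------------------------------------------------------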
+
+void CLLogits1DMaxShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Collapse window in Z dimension
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    // Reconfigure window in case of parallel reduction
+    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(_input->info()->dimension(0));
+    if(std::get<0>(parallel_reduction_info))
+    {
+        // To launch grid_size parallel workitems, steps.x should be modified as follows.
+        const unsigned int step = std::get<1>(parallel_reduction_info);
+        window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size * step, step));
+    }
+
+    // Get slices
+    Window slice = window_collapsed.first_slice_window_3D();
+    do
+    {
+        unsigned int idx = 0;
+        // Set inputs
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _max, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, _sum, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
 }
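
// --- Editorial illustration, not part of the patch ----------------------------
// In the parallel case, run() rewrites the X dimension as [0, grid_size * step)
// with step equal to the parallel vector size, so exactly grid_size work-items
// are launched along X; combined with _lws_hint = cl::NDRange(_grid_size), the
// whole row reduction lands in a single workgroup (lws[0] == gws[0]).
unsigned int work_items_along_x(unsigned int grid_size, unsigned int step)
{
    const unsigned int start = 0;
    const unsigned int end   = grid_size * step;
    return (end - start) / step; // == grid_size, e.g. 64 work-items for step 4
}
// ------------------------------------------------------------------------------
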
@@ -194,47 +575,50 @@
 {
 }
 
-void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
+
+    // Note: output should always have a scale of 1/256 and offset 0
+    const QuantizationInfo allowed_quantization_info = QuantizationInfo(1.f / 256, 0);
+    const bool             is_quantized_asymmetric   = (input->info()->data_type() == DataType::S32);
+    const DataType         output_data_type          = is_quantized_asymmetric ? DataType::QASYMM8 : input->info()->data_type();
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(input->info(), sum->info(), output->info()));
 
     _input  = input;
     _sum    = sum;
     _output = output;
 
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position())));
-    }
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(is_data_type_fixed_point(input->info()->data_type()),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_options_if(is_quantized_asymmetric,
+                              prepare_quantized_softmax_build_options(input->info()->quantization_info().scale, beta).options());
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+    std::string kernel_name = is_quantized_asymmetric ? "softmax_layer_norm_quantized" : "softmax_layer_norm";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    auto win_config = validate_and_configure_window_1DNorm(input->info(), output->info(), sum->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
+}
 
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+Status CLLogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(input, sum, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_1DNorm(input->clone().get(), output->clone().get(), sum->clone().get()).first);
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, sum_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
+    return Status{};
 }
 
 void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -255,7 +639,7 @@
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _sum, sum_slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 75d31d5..3b5fbc9 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 
 #include <set>
 #include <sstream>
@@ -38,24 +39,80 @@
 
 using namespace arm_compute;
 
-void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+namespace
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t w_out = input->info()->dimension(1);
-    const size_t h_out = input->info()->dimension(0);
+TensorShape transposed_tensor_shape(const TensorShape &in)
+{
+    TensorShape  output_shape{ in };
+    const size_t w_out = in[1];
+    const size_t h_out = in[0];
     output_shape.set(0, w_out);
     output_shape.set(1, h_out);
 
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    return output_shape;
+}
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration), ceil_to_multiple(output->dimension(1),
+                                         num_elems_processed_per_iteration));
+
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
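
// --- Editorial illustration, not part of the patch ----------------------------
// The transpose window works on square blocks whose side is
// max_cl_vector_width / element_size; the NON_MULTIPLE_OF_16 handling elsewhere
// in this patch implies max_cl_vector_width == 16, so the block side is 4 for
// F32, 8 for F16 and 16 for U8. The output access is then padded up to whole
// blocks in both dimensions. Stand-alone rendering of the block-side choice:
unsigned int transpose_block_side(unsigned int element_size_in_bytes)
{
    const unsigned int max_cl_vector_width = 16; // assumption drawn from the NON_MULTIPLE_OF_16 option
    return max_cl_vector_width / element_size_in_bytes;
}
// transpose_block_side(4) == 4 (F32), transpose_block_side(2) == 8 (F16), transpose_block_side(1) == 16 (U8).
// ------------------------------------------------------------------------------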
+
+Status CLTransposeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input    = input;
     _output   = output;
@@ -69,16 +126,7 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts));
 
     // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->info()->element_size();
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
-
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    ICLKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure(win_config.second);
 }
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index bc27477..3a9a32e 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -41,12 +41,12 @@
 
 void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
 
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
+    const DataType data_type = input->info()->data_type();
 
+    // Calculate output shape
     TensorShape output_shape{ input->info()->tensor_shape() };
     output_shape.collapse(3);
     const size_t tmp_dim = output_shape[0];
@@ -54,7 +54,7 @@
     output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -62,6 +62,7 @@
 
     if(biases != nullptr)
     {
+        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(data_type));
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
@@ -75,16 +76,13 @@
     _input  = input;
 
     // Create build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(((biases != nullptr) ? "-DHAS_BIAS" : ""));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
+    build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts.options()));
 
     // Set static arguments
     unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
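
CLBuildOptions replaces the manual std::set handling above. The following self-contained stand-in is purely illustrative (it is not the library class) and mirrors the three calls used in this hunk: add_option(), add_option_if() and options():

    #include <iostream>
    #include <set>
    #include <string>

    // Hypothetical mini version of the build-options helper, for illustration only
    class MiniBuildOptions
    {
    public:
        void add_option(std::string opt)
        {
            _opts.emplace(std::move(opt));
        }
        void add_option_if(bool cond, std::string opt)
        {
            if(cond)
            {
                add_option(std::move(opt));
            }
        }
        const std::set<std::string> &options() const
        {
            return _opts;
        }

    private:
        std::set<std::string> _opts;
    };

    int main()
    {
        MiniBuildOptions build_opts;
        build_opts.add_option("-DDATA_TYPE=float");
        build_opts.add_option_if(true, "-DHAS_BIAS");                // added only when a bias tensor is present
        build_opts.add_option_if(false, "-DFIXED_POINT_POSITION=5"); // skipped for non fixed-point data types

        for(const auto &opt : build_opts.options())
        {
            std::cout << opt << "\n";
        }
        return 0;
    }
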
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index ebe3db9..62a2477 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -62,6 +62,7 @@
 void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
new file mode 100644
index 0000000..4b137b0
--- /dev/null
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
+{
+    TensorShape output_shape = input->tensor_shape();
+    permute(output_shape, perm);
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 3, "Invalid input size!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0)),
+                                    "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, perm));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+template <typename T>
+void CPPPermuteKernel::run_permute(const Window &window)
+{
+    const int output_stride_x = _output->info()->strides_in_bytes().x();
+    const int output_stride_y = _output->info()->strides_in_bytes().y();
+    const int output_stride_z = _output->info()->strides_in_bytes().z();
+
+    Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Create iterators
+    Iterator in(_input, window);
+    Iterator out(_output, window_out);
+
+    // Run [2, 0, 1] permute
+    if(_perm[0] == 2 && _perm[1] == 0 && _perm[2] == 1)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int idx                             = id.y() * output_stride_z + id.x() * output_stride_y + id.z() * output_stride_x;
+            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+        },
+        in, out);
+    }
+    // Run [1, 2, 0] permute
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int idx                             = id.x() * output_stride_z + id.z() * output_stride_y + id.y() * output_stride_x;
+            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+        },
+        in, out);
+    }
+}
+
+CPPPermuteKernel::CPPPermuteKernel()
+    : _func(), _input(nullptr), _output(nullptr), _perm()
+{
+}
+
+void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), perm)));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
+
+    _input  = input;
+    _output = output;
+    _perm   = perm;
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            _func = &CPPPermuteKernel::run_permute<uint8_t>;
+            break;
+        case 2:
+            _func = &CPPPermuteKernel::run_permute<uint16_t>;
+            break;
+        case 4:
+            _func = &CPPPermuteKernel::run_permute<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // CPPPermuteKernel doesn't need padding, so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    ICPPKernel::configure(win);
+}
+
+Status CPPPermuteKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, perm));
+    return Status{};
+}
+
+void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    if(_func != nullptr)
+    {
+        (this->*_func)(window);
+    }
+}
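
A minimal usage sketch for the new kernel, driving it directly for illustration (shapes, data type and the single-threaded run over the full window are arbitrary choices; a runtime-level function would normally schedule it instead):

    #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 3U, 2U), 1, DataType::F32));

        // [2, 0, 1] is one of the two permutations handled by run_permute()
        CPPPermuteKernel kernel;
        kernel.configure(&src, &dst, PermutationVector(2U, 0U, 1U)); // dst info is auto-initialized

        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src ...
        ThreadInfo info;
        kernel.run(kernel.window(), info); // single-threaded run over the full window
        return 0;
    }
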
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index 3b1c7ae..edc5e40 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -71,6 +71,7 @@
 void CPPSortEuclideanDistanceKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
 
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 389e390..b593c27 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -28,25 +28,35 @@
 #include <iostream>
 #include <stdexcept>
 
-void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
-{
-    char    out[512];
-    va_list args;
-    va_start(args, msg);
-    int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
-    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
-    va_end(args);
+using namespace arm_compute;
 
-    throw std::runtime_error(std::string(out));
+Status arm_compute::create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args)
+{
+    char out[512];
+    int  offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
+    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+
+    return Status(error_code, std::string(out));
 }
 
-void arm_compute::debug(const char *function, const char *file, const int line, const char *msg, ...)
+Status arm_compute::create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...)
 {
-    char    out[512];
     va_list args;
     va_start(args, msg);
-    int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line);
-    vsnprintf(out + offset, sizeof(out) - offset, msg, args);
+    auto err = create_error_va_list(error_code, function, file, line, msg, args);
     va_end(args);
-    std::cout << std::string(out) << std::endl;
+    return err;
+}
+
+void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...)
+{
+    va_list args;
+    va_start(args, msg);
+    auto err = create_error_va_list(ErrorCode::RUNTIME_ERROR, function, file, line, msg, args);
+    va_end(args);
+    throw std::runtime_error(err.error_description());
+}
+void Status::internal_throw_on_error()
+{
+    throw std::runtime_error(_error_description);
 }
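
A short sketch of the Status-based flow these helpers enable. The function/file/line arguments are normally supplied by the ARM_COMPUTE_* macros; error_code() and ErrorCode::OK are assumed to be the usual accessors:

    #include "arm_compute/core/Error.h"

    #include <iostream>

    using namespace arm_compute;

    // Return an error Status instead of throwing, so callers can probe support up front
    Status check_positive(int value)
    {
        if(value <= 0)
        {
            return create_error(ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, "Value %d is not positive", value);
        }
        return Status{};
    }

    int main()
    {
        const Status status = check_positive(-3);
        if(status.error_code() != ErrorCode::OK)
        {
            std::cout << status.error_description() << std::endl;
        }
        return 0;
    }
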
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
new file mode 100644
index 0000000..53a10f9
--- /dev/null
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <regex>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+GCProgram::GCProgram()
+    : _name(), _source()
+{
+}
+
+GCProgram::GCProgram(std::string name, std::string source)
+    : _name(std::move(name)), _source(std::move(source))
+{
+}
+
+GLuint GCProgram::link_program(GLuint shader)
+{
+    GLuint program = ARM_COMPUTE_GL_CHECK(glCreateProgram());
+
+    GLint   rvalue;
+    GLsizei length;
+
+    ARM_COMPUTE_GL_CHECK(glAttachShader(program, shader));
+    ARM_COMPUTE_GL_CHECK(glLinkProgram(program));
+    ARM_COMPUTE_GL_CHECK(glDetachShader(program, shader));
+    ARM_COMPUTE_GL_CHECK(glDeleteShader(shader));
+
+    // Check if there were some issues when linking the shader.
+    ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_LINK_STATUS, &rvalue));
+
+    if(rvalue == 0)
+    {
+        ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length));
+
+        std::vector<GLchar> log(length);
+        ARM_COMPUTE_GL_CHECK(glGetProgramInfoLog(program, length, nullptr, log.data()));
+        ARM_COMPUTE_ERROR("Error: Linker log:\n%s\n", log.data());
+
+        return 0;
+    }
+
+    ARM_COMPUTE_GL_CHECK(glUseProgram(program));
+
+    return program;
+}
+
+GLuint GCProgram::compile_shader(const std::string &build_options)
+{
+    GLuint shader = ARM_COMPUTE_GL_CHECK(glCreateShader(GL_COMPUTE_SHADER));
+
+    const char *src[]
+    {
+        "#version 310 es\n",
+        build_options.c_str(),
+        _source.c_str()
+    };
+
+    ARM_COMPUTE_GL_CHECK(glShaderSource(shader, sizeof(src) / sizeof(src[0]), src, nullptr));
+
+    ARM_COMPUTE_GL_CHECK(glCompileShader(shader));
+
+    // Check if there were any issues when compiling the shader
+    GLint   rvalue;
+    GLsizei length;
+
+    ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &rvalue));
+
+    if(rvalue == 0)
+    {
+        ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length));
+
+        std::vector<GLchar> log(length);
+        ARM_COMPUTE_GL_CHECK(glGetShaderInfoLog(shader, length, nullptr, log.data()));
+
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+        std::istringstream ss(_source);
+        std::stringstream  output_stream;
+        std::string        line;
+        size_t             line_num = 1;
+
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("GLES Shader build options:\n%s\n", build_options.c_str());
+        while(std::getline(ss, line, '\n'))
+        {
+            output_stream << std::setw(6) << line_num << ": " << line << std::endl;
+            line_num++;
+        }
+        ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:\n"
+                                         << output_stream.rdbuf());
+#endif /* ARM_COMPUTE_DEBUG_ENABLED */
+
+        ARM_COMPUTE_ERROR("Error: Compiler log:\n%s\n", log.data());
+
+        return 0;
+    }
+
+    return shader;
+}
+
+GCKernel::GCKernel()
+    : _name(), _program(), _shader_arguments(), _shader_params_ubo_name(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size()
+{
+}
+
+// Add a default destructor in the cpp file to work around the free-of-unallocated-value issue on Android
+GCKernel::~GCKernel() // NOLINT
+{
+}
+
+GCKernel::GCKernel(std::string name, GLuint program)
+    : _name(std::move(name)),
+      _program(program),
+      _shader_arguments(),
+      _shader_params_ubo_name(0),
+      _shader_params_binding_point(0),
+      _shader_params_index(0),
+      _shader_params_size(0)
+{
+    _shader_arguments.clear();
+
+    ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
+
+    _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
+    ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name);
+}
+
+void GCKernel::cleanup()
+{
+    ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params_ubo_name));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
+    ARM_COMPUTE_GL_CHECK(glDeleteProgram(_program));
+    ARM_COMPUTE_GL_CHECK(glUseProgram(0));
+}
+
+void GCKernel::use()
+{
+    ARM_COMPUTE_GL_CHECK(glUseProgram(_program));
+}
+
+void GCKernel::unuse()
+{
+    ARM_COMPUTE_GL_CHECK(glUseProgram(0));
+}
+
+void GCKernel::update_shader_params()
+{
+    ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size != (int)(_shader_arguments.size() * sizeof(_shader_arguments[0]))), "Arguments size (%d) is not equal to shader params block size (%d)",
+                             _shader_arguments.size() * sizeof(_shader_arguments[0]), _shader_params_size);
+
+    ARM_COMPUTE_GL_CHECK(glUniformBlockBinding(_program, _shader_params_index, _shader_params_binding_point));
+    ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params_ubo_name));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params_ubo_name));
+    ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _shader_arguments.data(), GL_DYNAMIC_DRAW));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
+}
+
+const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map =
+{
+    { "absdiff", "absdiff.cs" },
+    { "col2im", "convolution_layer.cs" },
+    { "direct_convolution1x1", "direct_convolution1x1.cs" },
+    { "direct_convolution3x3", "direct_convolution3x3.cs" },
+    { "direct_convolution5x5", "direct_convolution5x5.cs" },
+    { "pooling_layer_2", "pooling_layer.cs" },
+    { "pooling_layer_3", "pooling_layer.cs" },
+    { "pooling_layer_7", "pooling_layer.cs" },
+    { "pooling_layer_3_optimized", "pooling_layer.cs" },
+    { "pooling_layer_n", "pooling_layer.cs" },
+    { "fill_image_borders_replicate", "fill_border.cs" },
+    { "fill_image_borders_constant", "fill_border.cs" },
+    { "gemm_accumulate_biases", "gemm.cs" },
+    { "gemm_interleave4x4", "gemm.cs" },
+    { "gemm_ma", "gemm.cs" },
+    { "gemm_mm_interleaved_transposed", "gemm.cs" },
+    { "gemm_mm_floating_point", "gemm.cs" },
+    { "gemm_transpose1x4", "gemm.cs" },
+    { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" },
+    { "im2col_generic", "convolution_layer.cs" },
+    { "im2col_reduced", "convolution_layer.cs" },
+    { "transpose", "transpose.cs" },
+    { "activation_layer", "activation_layer.cs" },
+    { "softmax_layer_max", "softmax_layer.cs" },
+    { "softmax_layer_shift_exp_sum", "softmax_layer.cs" },
+    { "softmax_layer_norm", "softmax_layer.cs" },
+    { "pixelwise_mul_float", "pixelwise_mul_float.cs" },
+    { "normalization_layer", "normalization_layer.cs" },
+    { "batchnormalization_layer", "batchnormalization_layer.cs" },
+    { "concatenate_depth", "concatenate.cs" },
+    { "dropout", "dropout.cs" },
+};
+
+const std::map<std::string, std::string> GCKernelLibrary::_program_source_map =
+{
+#ifdef EMBEDDED_KERNELS
+    {
+        "absdiff.cs",
+#include "./cs_shaders/absdiff.csembed"
+    },
+    {
+        "convolution_layer.cs",
+#include "./cs_shaders/convolution_layer.csembed"
+    },
+    {
+        "direct_convolution1x1.cs",
+#include "./cs_shaders/direct_convolution1x1.csembed"
+    },
+    {
+        "direct_convolution3x3.cs",
+#include "./cs_shaders/direct_convolution3x3.csembed"
+    },
+    {
+        "direct_convolution5x5.cs",
+#include "./cs_shaders/direct_convolution5x5.csembed"
+    },
+    {
+        "pooling_layer.cs",
+#include "./cs_shaders/pooling_layer.csembed"
+    },
+    {
+        "fill_border.cs",
+#include "./cs_shaders/fill_border.csembed"
+    },
+    {
+        "gemm.cs",
+#include "./cs_shaders/gemm.csembed"
+    },
+    {
+        "transpose.cs",
+#include "./cs_shaders/transpose.csembed"
+    },
+    {
+        "activation_layer.cs",
+#include "./cs_shaders/activation_layer.csembed"
+    },
+    {
+        "softmax_layer.cs",
+#include "./cs_shaders/softmax_layer.csembed"
+    },
+    {
+        "pixelwise_mul_float.cs",
+#include "./cs_shaders/pixelwise_mul_float.csembed"
+    },
+    {
+        "normalization_layer.cs",
+#include "./cs_shaders/normalization_layer.csembed"
+    },
+    {
+        "batchnormalization_layer.cs",
+#include "./cs_shaders/batchnormalization_layer.csembed"
+    },
+    {
+        "concatenate.cs",
+#include "./cs_shaders/concatenate.csembed"
+    },
+    {
+        "dropout.cs",
+#include "./cs_shaders/dropout.csembed"
+    },
+#endif /* EMBEDDED_KERNELS */
+};
+
+GCKernelLibrary::GCKernelLibrary()
+    : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _own_context(false), _shader_path("./"), _programs_map(), _built_programs_map()
+{
+}
+
+GCKernelLibrary &GCKernelLibrary::get()
+{
+    static GCKernelLibrary _kernel_library;
+    return _kernel_library;
+}
+
+GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const StringSet &build_options_set) const
+{
+    // Find which program contains the kernel
+    auto shader_program_it = _shader_program_map.find(shader_name);
+
+    if(_shader_program_map.end() == shader_program_it)
+    {
+        ARM_COMPUTE_ERROR("Shader %s not found in the GCKernelLibrary", shader_name.c_str());
+    }
+
+    // Check if the program has been built before with same build options.
+    const std::string program_name       = shader_program_it->second;
+    const std::string build_options      = stringify_set(build_options_set);
+    const std::string built_program_name = program_name + "_" + build_options;
+    auto              built_program_it   = _built_programs_map.find(built_program_name);
+
+    GCKernel kernel;
+
+    if(_built_programs_map.end() != built_program_it)
+    {
+        // If program has been built, retrieve to create kernel from it
+        kernel = built_program_it->second;
+    }
+    else
+    {
+        GCProgram program = load_program(program_name);
+
+        std::string source_name = _shader_path + shader_program_it->second;
+
+        // Load shader
+        GLuint shader = program.compile_shader(build_options);
+
+        // Build program
+        GLuint gles_program = program.link_program(shader);
+
+        // Create GCKernel
+        kernel = GCKernel(shader_name, gles_program);
+
+        // Add built program to internal map
+        _built_programs_map.emplace(built_program_name, kernel);
+    }
+
+    kernel.use();
+    kernel.clear_arguments();
+    // Set shader params binding point
+    kernel.set_shader_params_binding_point(0);
+
+    return kernel;
+}
+
+const std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
+{
+    enum class ParserStage
+    {
+        FIRST,
+        SKIP_COMMENTS = FIRST,
+        RESOLVE_INCLUDES,
+        SKIP_PREPROCESSOR_DIRECTIVES,
+        SEARCH_MACRO_DEFINITIONS,
+        EXPAND_MACRO_USES,
+        LAST
+    };
+
+    struct MacroDefinitionInfo
+    {
+        const std::vector<std::string> param_list;
+        const std::string              content;
+    };
+
+    // Found macro definitions so far
+    std::map<const std::string, const MacroDefinitionInfo> macro_definitions;
+
+    // Define a GLES compute shader parser function
+    std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
+    cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string
+    {
+        std::string dst;
+
+        if(stage == ParserStage::LAST || std::regex_match(src, std::regex(R"(\s*)")))
+        {
+            return src;
+        }
+        auto next_stage = static_cast<ParserStage>(static_cast<int>(stage) + 1);
+
+        std::string search_pattern;
+        switch(stage)
+        {
+            case ParserStage::SKIP_COMMENTS:
+                search_pattern = R"((/\*([^*]|\n|(\*+([^*/]|\n)))*\*+/)|(//.*))";
+                break;
+            case ParserStage::RESOLVE_INCLUDES:
+                search_pattern = R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx";
+                break;
+            case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
+                search_pattern = R"((^|\n)[ \t]*(#ifdef|#ifndef|#if)[^\n]+)";
+                break;
+            case ParserStage::SEARCH_MACRO_DEFINITIONS:
+                search_pattern = R"((?:^|\n)[ \t]*#define[ \t]+(\w+)(?:\((\w+(?:[ \t]*,[ \t]*\w+)*)\))?(?: |\t|\\\n)*((?:(?:[^\\\n]|\\[^\n])*\\+\n)*(?:[ \t]*[^ \t\n]+)*)[ \t]*)";
+                break;
+            case ParserStage::EXPAND_MACRO_USES:
+            {
+                if(macro_definitions.empty())
+                {
+                    // Nothing to expand
+                    return src;
+                }
+                int i = 0;
+                for(auto &def : macro_definitions)
+                {
+                    if(i == 0)
+                    {
+                        search_pattern = R"((\b)" + def.first;
+                    }
+                    else
+                    {
+                        search_pattern += R"(\b|\b)" + def.first;
+                    }
+                    i++;
+                }
+                search_pattern += R"(\b))";
+                break;
+            }
+            default:
+                break;
+        }
+
+        std::regex  search_regex(search_pattern);
+        std::smatch match;
+        ptrdiff_t   parsed_pos = 0;
+        if(std::regex_search(src, match, search_regex))
+        {
+            // Pass the content before the match to the next stage
+            dst.append(cs_parser(src.substr(0, match.position()), next_stage, 0));
+            parsed_pos = match.position() + match.length();
+
+            // Deal with the matched content
+            switch(stage)
+            {
+                case ParserStage::RESOLVE_INCLUDES:
+                {
+                    // Replace with the included file contents
+                    // And parse the content from the first stage
+                    const std::string source_name = _shader_path + match.str(1);
+                    dst.append(cs_parser(read_file(source_name, false), ParserStage::FIRST, 0));
+                    break;
+                }
+                case ParserStage::SEARCH_MACRO_DEFINITIONS:
+                {
+                    std::regex                     params_regex(R"(\b\w+\b)");
+                    const std::string              macro_param_str = match.str(2);
+                    const std::vector<std::string> macro_param_list(
+                        std::sregex_token_iterator(macro_param_str.begin(),
+                                                   macro_param_str.end(),
+                                                   params_regex),
+                        std::sregex_token_iterator());
+
+                    const MacroDefinitionInfo info =
+                    {
+                        macro_param_list,
+                        match.str(3)
+                    };
+                    // Collect the macro definition data and leave the shader source unchanged
+                    macro_definitions.insert(std::pair<const std::string, const MacroDefinitionInfo>(match.str(1), info));
+                    dst.append(match.str());
+                    break;
+                }
+                case ParserStage::EXPAND_MACRO_USES:
+                {
+                    ptrdiff_t                args_str_length = 0;
+                    std::vector<std::string> args_list;
+
+                    // Walk through argument list, because the regular expression does NOT support nested parentheses
+                    size_t cur_args_str_pos = match.position() + match.length();
+                    if(src[cur_args_str_pos++] == '(')
+                    {
+                        int       nested_parentheses = 0;
+                        ptrdiff_t cur_arg_pos        = cur_args_str_pos;
+                        ptrdiff_t cur_arg_length     = 0;
+
+                        args_str_length++;
+                        while(src[cur_args_str_pos] != ')' || nested_parentheses != 0)
+                        {
+                            switch(src[cur_args_str_pos++])
+                            {
+                                case '(':
+                                    nested_parentheses++;
+                                    cur_arg_length++;
+                                    break;
+                                case ',':
+                                    if(nested_parentheses == 0)
+                                    {
+                                        args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
+                                        cur_arg_pos    = cur_args_str_pos;
+                                        cur_arg_length = 0;
+                                    }
+                                    else
+                                    {
+                                        cur_arg_length++;
+                                    }
+                                    break;
+                                case ' ':
+                                case '\t':
+                                    if(cur_arg_length == 0)
+                                    {
+                                        cur_arg_pos++;
+                                    }
+                                    else
+                                    {
+                                        cur_arg_length++;
+                                    }
+                                    break;
+                                case ')':
+                                    nested_parentheses--;
+                                // Intentional fall-through: the closing parenthesis is part of the current argument
+                                default:
+                                    cur_arg_length++;
+                                    break;
+                            }
+                            args_str_length++;
+                        }
+                        if(src[cur_args_str_pos] == ')' && nested_parentheses == 0)
+                        {
+                            args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
+                        }
+                        args_str_length++;
+                    }
+
+                    std::string                    expanded_content = match.str();
+                    const std::vector<std::string> macro_param_list = macro_definitions.at(match.str()).param_list;
+
+                    if((nested_level != 0 || !macro_param_list.empty()) && macro_param_list.size() == args_list.size())
+                    {
+                        parsed_pos += args_str_length;
+                        expanded_content = macro_definitions.at(match.str()).content;
+                        size_t i         = 0;
+                        for(auto &param_name : macro_param_list)
+                        {
+                            std::regex params_regex(R"(\b)" + param_name + R"(\b)");
+                            expanded_content.assign(std::regex_replace(expanded_content, params_regex, args_list[i]));
+                            ++i;
+                        }
+                        // Expand macro recursively
+                        expanded_content = cs_parser(expanded_content, stage, nested_level + 1);
+
+                        if(nested_level == 0)
+                        {
+                            const std::regex token_pasting_rgx = std::regex(R"(\b##\b)");
+                            if(std::regex_search(expanded_content, token_pasting_rgx))
+                            {
+                                // Remove token pasting operator "##"
+                                expanded_content.assign(std::regex_replace(expanded_content, std::regex(token_pasting_rgx), ""));
+                                // Trim trailing whitespace
+                                expanded_content.assign(std::regex_replace(expanded_content, std::regex(R"([ \t]*\\\n)"), "\n"));
+                            }
+                            else
+                            {
+                                // Do not expand the macro if the result does not contain the token pasting operator "##"
+                                expanded_content = src.substr(match.position(), match.length() + args_str_length);
+                            }
+                        }
+                    }
+                    dst.append(expanded_content);
+                    break;
+                }
+                case ParserStage::SKIP_COMMENTS:
+                case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
+                default:
+                    dst.append(match.str());
+                    break;
+            }
+            next_stage = stage;
+        }
+        dst.append(cs_parser(src.substr(parsed_pos, src.length() - parsed_pos), next_stage, 0));
+
+        return dst;
+    };
+
+    return cs_parser(shader_source, ParserStage::FIRST, 0);
+}
+
+const GCProgram &GCKernelLibrary::load_program(const std::string &program_name) const
+{
+    const auto program_it = _programs_map.find(program_name);
+
+    if(program_it != _programs_map.end())
+    {
+        return program_it->second;
+    }
+
+    GCProgram program;
+
+#ifdef EMBEDDED_KERNELS
+    const auto program_source_it = _program_source_map.find(program_name);
+
+    if(_program_source_map.end() == program_source_it)
+    {
+        ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+    }
+
+    //       Note: the preprocessing should ideally be done at compile time.
+    //       The preprocess_shader function adds support for the "#include" directive and the token pasting operator "##".
+    //       This work could be done at compile time by a Python script to get better performance at runtime.
+    //       EMBEDDED_KERNELS is usually defined in release builds.
+    program = GCProgram(program_name, preprocess_shader(program_source_it->second));
+#else  /* EMBEDDED_KERNELS */
+    // Check if the shader source file exists
+    std::string source_name = _shader_path + program_name;
+    if(std::ifstream(source_name).is_open())
+    {
+        program = GCProgram(program_name, preprocess_shader(read_file(source_name, false)));
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Shader file %s does not exist.", source_name.c_str());
+    }
+#endif /* EMBEDDED_KERNELS */
+
+    // Insert program to program map
+    const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+    return new_program.first->second;
+}
+
+void GCKernelLibrary::setup_context()
+{
+    EGLBoolean res;
+    _display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+
+    ARM_COMPUTE_ERROR_ON_MSG(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
+
+    res = eglInitialize(_display, nullptr, nullptr);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS);
+    ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
+    ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
+    ARM_COMPUTE_UNUSED(egl_extension_st);
+
+    const EGLint config_attribs[] =
+    {
+        EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
+        EGL_NONE
+    };
+    EGLConfig cfg;
+    EGLint    count;
+
+    res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    res = eglBindAPI(EGL_OPENGL_ES_API);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
+
+    const EGLint attribs[] =
+    {
+        EGL_CONTEXT_CLIENT_VERSION, 3,
+        EGL_NONE
+    };
+    _context = eglCreateContext(_display,
+                                cfg,
+                                EGL_NO_CONTEXT,
+                                attribs);
+
+    ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+
+    res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
+
+    ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
+    ARM_COMPUTE_UNUSED(res);
+}
+
+void GCKernelLibrary::setup_dummy_fbo()
+{
+    ARM_COMPUTE_GL_CHECK(glGenFramebuffers(1, &_frame_buffer));
+    ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, _frame_buffer));
+    ARM_COMPUTE_GL_CHECK(glGenTextures(1, &_tex_rt));
+    ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, _tex_rt));
+    ARM_COMPUTE_GL_CHECK(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 1, 1, 0, GL_RGB, GL_UNSIGNED_BYTE, nullptr));
+    ARM_COMPUTE_GL_CHECK(glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _tex_rt, 0));
+}
+
+GCKernelLibrary::~GCKernelLibrary()
+{
+    for(auto &program : _built_programs_map)
+    {
+        static_cast<GCKernel>(program.second).cleanup();
+    }
+
+    ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, 0));
+    ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, 0));
+    ARM_COMPUTE_GL_CHECK(glDeleteTextures(1, &_tex_rt));
+    ARM_COMPUTE_GL_CHECK(glDeleteFramebuffers(1, &_frame_buffer));
+
+    if(_own_context)
+    {
+        eglDestroyContext(_display, _context);
+        eglTerminate(_display);
+
+        _context = EGL_NO_CONTEXT;
+        _display = EGL_NO_DISPLAY;
+    }
+}
+
+std::string GCKernelLibrary::stringify_set(const StringSet &s) const
+{
+    std::string concat_set;
+
+    // Concatenate set
+    for(const auto &el : s)
+    {
+        concat_set += el + "\n";
+    }
+
+    return concat_set;
+}
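
Usage sketch for the program cache above. It assumes a GLES 3.1 context has already been made current (for example via setup_context()), that StringSet is the usual std::set<std::string> alias, and the DATA_TYPE_FP32 define is purely illustrative:

    #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"

    #include <set>
    #include <string>

    using namespace arm_compute;

    void build_transpose_kernel()
    {
        // Options are concatenated by stringify_set() and injected right after "#version 310 es"
        std::set<std::string> build_opts;
        build_opts.emplace("#define DATA_TYPE_FP32");

        // The first call compiles and links transpose.cs; repeated calls with the same
        // options are served from _built_programs_map
        GCKernel kernel = GCKernelLibrary::get().create_kernel("transpose", build_opts);

        // ... set shader arguments, kernel.update_shader_params(), dispatch ...
        kernel.unuse();
    }
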
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
new file mode 100644
index 0000000..c60c167
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+#include <cstddef>
+#include <sstream>
+
+using namespace arm_compute;
+
+void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws)
+{
+    ARM_COMPUTE_UNUSED(kernel);
+
+    if(kernel.kernel().get_program() == 0)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
+
+    ARM_COMPUTE_ERROR_ON_MSG((((window.x().end() - window.x().start()) % (window.x().step() * lws[0])) != 0),
+                             "window x end =%d, start=%d, step=%d, lws x=%d", window.x().end(), window.x().start(), window.x().step(), lws[0]);
+    ARM_COMPUTE_ERROR_ON_MSG((((window.y().end() - window.y().start()) % (window.y().step() * lws[1])) != 0),
+                             "window y end =%d, start=%d, step=%d, lws y=%d", window.y().end(), window.y().start(), window.y().step(), lws[1]);
+    ARM_COMPUTE_ERROR_ON_MSG((((window.z().end() - window.z().start()) % (window.z().step() * lws[2])) != 0),
+                             "window z end =%d, start=%d, step=%d, lws z=%d", window.z().end(), window.z().start(), window.z().step(), lws[2]);
+
+    ARM_COMPUTE_GL_CHECK(glDispatchCompute(((window.x().end() - window.x().start()) / window.x().step()) / lws[0],
+                                           ((window.y().end() - window.y().start()) / window.y().step()) / lws[1],
+                                           ((window.z().end() - window.z().start()) / window.z().step()) / lws[2]));
+}
+
+IGCKernel::IGCKernel()
+    : _kernel()
+{
+}
+
+GCKernel &IGCKernel::kernel()
+{
+    return _kernel;
+}
+
+template <unsigned int dimension_size>
+unsigned int           IGCKernel::num_arguments_per_tensor() const
+{
+    // Round the tensor attributes structure in the compute shader up to a multiple of a vec4
+    return ceil_to_multiple(1 + 2 * dimension_size, 4);
+}
+
+template <unsigned int dimension_size>
+void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+    const ITensorInfo *info    = tensor->info();
+    const Strides     &strides = info->strides_in_bytes();
+
+    // Calculate offset to the start of the window
+    unsigned int offset_first_element = info->offset_first_element_in_bytes();
+
+    for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+    {
+        offset_first_element += window[n].start() * strides[n];
+    }
+
+    unsigned int idx_start = idx;
+
+    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    {
+        _kernel.set_argument(idx++, strides[dimension]);
+        _kernel.set_argument(idx++, strides[dimension] * window[dimension].step());
+    }
+
+    _kernel.set_argument(idx++, offset_first_element);
+    _kernel.set_argument(idx++, param.buffer_data_type_shift);
+
+    // Round the tensor attributes structure in the compute shader up to a multiple of a vec4
+    unsigned int idx_end = ceil_to_multiple(idx, 4);
+    for(unsigned int i = idx; i < idx_end; ++i)
+    {
+        _kernel.set_argument(i, 0);
+    }
+    idx = idx_end;
+
+    ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, param.binding_point, tensor->gc_buffer()));
+
+    ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
+                             "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+    ARM_COMPUTE_UNUSED(idx_start);
+}
+
+void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<1>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<1>(idx, tensor, param, window);
+}
+
+void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<2>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<2>(idx, tensor, param, window);
+}
+
+void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
+{
+    add_tensor_argument<3>(idx, tensor, BufferParam(binding_point, 0), window);
+}
+
+void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window)
+{
+    add_tensor_argument<3>(idx, tensor, param, window);
+}
+
+unsigned int IGCKernel::num_arguments_per_1D_tensor() const
+{
+    return num_arguments_per_tensor<1>();
+}
+
+unsigned int IGCKernel::num_arguments_per_2D_tensor() const
+{
+    return num_arguments_per_tensor<2>();
+}
+
+unsigned int IGCKernel::num_arguments_per_3D_tensor() const
+{
+    return num_arguments_per_tensor<3>();
+}
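
The vec4 rounding in num_arguments_per_tensor() and add_tensor_argument() can be checked with a quick worked example; ceil_to_multiple is assumed to mean "round up to the next multiple of the divisor":

    #include <iostream>

    // Round value up to the next multiple of divisor (what ceil_to_multiple is assumed to do)
    unsigned int round_up(unsigned int value, unsigned int divisor)
    {
        return ((value + divisor - 1) / divisor) * divisor;
    }

    int main()
    {
        // 3D tensor: 2 * 3 stride/step values plus offset and data-type shift give 8 arguments,
        // which matches round_up(1 + 2 * 3, 4) = 8, so no padding slots are needed
        std::cout << round_up(1 + 2 * 3, 4) << "\n"; // prints 8
        // 1D tensor: 2 + 2 = 4 arguments, matching round_up(1 + 2 * 1, 4) = 4
        std::cout << round_up(1 + 2 * 1, 4) << "\n"; // prints 4
        return 0;
    }
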
diff --git a/src/core/Logger.cpp b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
similarity index 61%
copy from src/core/Logger.cpp
copy to src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
index 9c3bf26..5bb479e 100644
--- a/src/core/Logger.cpp
+++ b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
@@ -21,36 +21,31 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+void IGCSimple2DKernel::run(const Window &window)
 {
-}
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
 
-Logger &Logger::get()
-{
-    static Logger _instance;
-    return _instance;
-}
+    _kernel.use();
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
-{
-    _ostream   = &ostream;
-    _verbosity = verbosity;
-}
+    Window slice = window.first_slice_window_2D();
 
-std::ostream &Logger::log_info()
-{
-    if(_verbosity == LoggerVerbosity::INFO)
+    do
     {
-        return *_ostream;
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, slice);
+        add_2D_tensor_argument(idx, _output, 2, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
     }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
+    while(window.slide_window_slice_2D(slice));
+}
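
The do/while slice pattern used in run() can be exercised in isolation with the core Window API (the window dimensions are arbitrary):

    #include "arm_compute/core/Window.h"

    #include <iostream>

    using namespace arm_compute;

    int main()
    {
        Window window;
        window.set(Window::DimX, Window::Dimension(0, 8, 1));
        window.set(Window::DimY, Window::Dimension(0, 4, 1));
        window.set(Window::DimZ, Window::Dimension(0, 3, 1));

        Window slice      = window.first_slice_window_2D();
        int    num_slices = 0;
        do
        {
            ++num_slices; // a kernel would bind its tensors and enqueue one dispatch here
        }
        while(window.slide_window_slice_2D(slice));

        std::cout << "Processed " << num_slices << " 2D slices\n"; // one per Z iteration => 3
        return 0;
    }
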
diff --git a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
new file mode 100644
index 0000000..61225d8
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+void IGCSimple3DKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
new file mode 100644
index 0000000..459601e
--- /dev/null
+++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+IGCSimpleKernel::IGCSimpleKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void IGCSimpleKernel::configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+{
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
+
+    IGCKernel::configure(win);
+}
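
The window/padding set-up in configure() follows the standard core-library pattern; a standalone sketch using only TensorInfo objects (shape and step are arbitrary, and the border parameters are left at their defaults):

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/IAccessWindow.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    int main()
    {
        TensorInfo input_info(TensorShape(17U, 9U), 1, DataType::F32);
        TensorInfo output_info(TensorShape(17U, 9U), 1, DataType::F32);

        constexpr unsigned int step = 4; // elements processed per iteration

        Window win = calculate_max_window(input_info, Steps(step));

        AccessWindowHorizontal input_access(&input_info, 0, step);
        AccessWindowHorizontal output_access(&output_info, 0, step);

        // Grows the (still resizable) tensor paddings so that the 4-element wide accesses fit
        update_window_and_padding(win, input_access, output_access);

        output_access.set_valid_region(win, input_info.valid_region());
        return 0;
    }
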
diff --git a/src/core/Logger.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
similarity index 70%
rename from src/core/Logger.cpp
rename to src/core/GLES_COMPUTE/IGCTensor.cpp
index 9c3bf26..5576665 100644
--- a/src/core/Logger.cpp
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp
@@ -21,36 +21,34 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+IGCTensor::IGCTensor()
+    : _mapping(nullptr)
 {
 }
 
-Logger &Logger::get()
+void IGCTensor::map(bool blocking)
 {
-    static Logger _instance;
-    return _instance;
+    _mapping = do_map(blocking);
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+void IGCTensor::unmap()
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    do_unmap();
+    _mapping = nullptr;
 }
 
-std::ostream &Logger::log_info()
+void IGCTensor::clear()
 {
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
+    this->map();
+    std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size());
+    this->unmap();
+}
+
+uint8_t *IGCTensor::buffer() const
+{
+    return _mapping;
+}
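
clear() above is the canonical map/fill/unmap sequence; a small helper sketch against the IGCTensor interface (the caller is assumed to pass a concrete, allocated GLES tensor):

    #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"

    #include <cstdint>
    #include <cstring>

    // Fill a GLES tensor with a constant byte value; value == 0 reproduces clear()
    void fill_bytes(arm_compute::IGCTensor &tensor, uint8_t value)
    {
        tensor.map(true); // blocking map: buffer() is valid until unmap()
        std::memset(tensor.buffer(), value, tensor.info()->total_size());
        tensor.unmap();
    }
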
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
new file mode 100644
index 0000000..d2539d0
--- /dev/null
+++ b/src/core/GLES_COMPUTE/OpenGLES.cpp
@@ -0,0 +1,812 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include <dlfcn.h>
+#include <iostream>
+#include <vector>
+
+using eglGetProcAddress_func         = __eglMustCastToProperFunctionPointerType EGLAPIENTRY (*)(const char *procname);
+using eglBindAPI_func                = EGLBoolean EGLAPIENTRY (*)(EGLenum api);
+using eglChooseConfig_func           = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
+using eglCreateContext_func          = EGLContext EGLAPIENTRY (*)(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
+using eglDestroyContext_func         = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLContext ctx);
+using eglGetDisplay_func             = EGLDisplay EGLAPIENTRY (*)(EGLNativeDisplayType display_id);
+using eglInitialize_func             = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLint *major, EGLint *minor);
+using eglMakeCurrent_func            = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
+using eglTerminate_func              = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy);
+using eglGetError_func               = EGLint         EGLAPIENTRY (*)();
+using eglQueryString_func            = char const * EGLAPIENTRY (*)(EGLDisplay dpy, EGLint name);
+using glAttachShader_func            = void GL_APIENTRY (*)(GLuint program, GLuint shader);
+using glCompileShader_func           = void GL_APIENTRY (*)(GLuint shader);
+using glCreateProgram_func           = GLuint GL_APIENTRY (*)();
+using glCreateShader_func            = GLuint GL_APIENTRY (*)(GLenum type);
+using glDeleteProgram_func           = void GL_APIENTRY (*)(GLuint program);
+using glDeleteShader_func            = void GL_APIENTRY (*)(GLuint shader);
+using glDetachShader_func            = void GL_APIENTRY (*)(GLuint program, GLuint shader);
+using glGetProgramInfoLog_func       = void GL_APIENTRY (*)(GLuint program, GLsizei bufsize, GLsizei *length, GLchar *infolog);
+using glGetProgramiv_func            = void GL_APIENTRY (*)(GLuint program, GLenum pname, GLint *params);
+using glGetShaderInfoLog_func        = void GL_APIENTRY (*)(GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *infolog);
+using glGetShaderiv_func             = void GL_APIENTRY (*)(GLuint shader, GLenum pname, GLint *params);
+using glLinkProgram_func             = void GL_APIENTRY (*)(GLuint program);
+using glShaderSource_func            = void GL_APIENTRY (*)(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length);
+using glUseProgram_func              = void GL_APIENTRY (*)(GLuint program);
+using glBindBuffer_func              = void GL_APIENTRY (*)(GLenum target, GLuint buffer);
+using glBindBufferBase_func          = void GL_APIENTRY (*)(GLenum target, GLuint index, GLuint buffer);
+using glBufferData_func              = void GL_APIENTRY (*)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage);
+using glDeleteBuffers_func           = void GL_APIENTRY (*)(GLsizei n, const GLuint *buffers);
+using glDispatchCompute_func         = void GL_APIENTRY (*)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
+using glFlush_func                   = void      GL_APIENTRY (*)();
+using glGenBuffers_func              = void GL_APIENTRY (*)(GLsizei n, GLuint *buffers);
+using glGetProgramResourceIndex_func = GLuint GL_APIENTRY (*)(GLuint program, GLenum programInterface, const GLchar *name);
+using glGetUniformLocation_func      = GLint GL_APIENTRY (*)(GLuint program, const GLchar *name);
+using glMapBufferRange_func          = void *GL_APIENTRY (*)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
+using glMemoryBarrier_func           = void GL_APIENTRY (*)(GLbitfield barriers);
+using glUniform1ui_func              = void GL_APIENTRY (*)(GLint location, GLuint v0);
+using glUnmapBuffer_func             = GLboolean GL_APIENTRY (*)(GLenum target);
+using glGetError_func                = GLenum              GL_APIENTRY (*)();
+using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
+using glUniformBlockBinding_func     = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
+using glGetUniformBlockIndex_func    = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName);
+using glGenTextures_func             = void GL_APIENTRY (*)(GLsizei n, GLuint *textures);
+using glDeleteTextures_func          = void GL_APIENTRY (*)(GLsizei n, const GLuint *textures);
+using glBindTexture_func             = void GL_APIENTRY (*)(GLenum target, GLuint texture);
+using glTexImage2D_func              = void GL_APIENTRY (*)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type,
+                                                            const GLvoid *pixels);
+using glGenFramebuffers_func      = void GL_APIENTRY (*)(GLsizei n, GLuint *framebuffers);
+using glDeleteFramebuffers_func   = void GL_APIENTRY (*)(GLsizei n, const GLuint *framebuffers);
+using glBindFramebuffer_func      = void GL_APIENTRY (*)(GLenum target, GLuint framebuffer);
+using glFramebufferTexture2D_func = void GL_APIENTRY (*)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
+
+class GLESSymbols
+{
+private:
+    void init()
+    {
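+        // EGL entry points are resolved with dlsym() on libEGL and, where still missing,
+        // via eglGetProcAddress(); GL entry points are resolved via eglGetProcAddress()
+        // first and then via dlsym() on libGLESv3/libGLESv2 for anything left unresolved.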
+        void *egl_handle    = dlopen("libEGL.so", RTLD_LAZY | RTLD_LOCAL);
+        void *glesv2_handle = dlopen("libGLESv2.so", RTLD_LAZY | RTLD_LOCAL);
+        void *glesv3_handle = dlopen("libGLESv3.so", RTLD_LAZY | RTLD_LOCAL);
+        if(egl_handle == nullptr)
+        {
+            std::cerr << "Can't load libEGL.so: " << dlerror() << std::endl;
+        }
+        else
+        {
+#undef EGL_ENTRY
+#define EGL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(dlsym(egl_handle, #_api));
+#include "./egl_entries.in"
+#undef EGL_ENTRY
+
+            if(eglGetProcAddress != nullptr)
+            {
+#undef EGL_ENTRY
+#define EGL_ENTRY(_api)   \
+    if((_api) == nullptr) \
+        (_api) = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
+#include "./egl_entries.in"
+#undef EGL_ENTRY
+
+#undef GL_ENTRY
+#define GL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
+#include "./gl_entries.in"
+#undef GL_ENTRY
+            }
+
+            std::vector<void *> handles = { glesv3_handle, glesv2_handle };
+            for(auto &handle : handles)
+            {
+                if(handle != nullptr)
+                {
+#undef GL_ENTRY
+#define GL_ENTRY(_api)    \
+    if((_api) == nullptr) \
+        (_api) = reinterpret_cast<_api##_func>(dlsym(handle, #_api));
+#include "./gl_entries.in"
+#undef GL_ENTRY
+                }
+            }
+
+            // Don't call dlclose(handle) or all the symbols will be unloaded!
+        }
+    }
+    bool _initialized = false;
+
+public:
+    static GLESSymbols &get()
+    {
+        static GLESSymbols symbols = GLESSymbols();
+        if(!symbols._initialized)
+        {
+            symbols._initialized = true;
+            symbols.init();
+        }
+
+        return symbols;
+    }
+
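+    // X-macro expansion: egl_entries.in / gl_entries.in list each API name once; expanding
+    // EGL_ENTRY/GL_ENTRY here declares one function-pointer member per entry, initialised
+    // to nullptr until init() resolves it.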
+#undef EGL_ENTRY
+#undef GL_ENTRY
+#define EGL_ENTRY(_api) _api##_func _api = nullptr;
+#define GL_ENTRY(_api) EGL_ENTRY(_api)
+#include "./egl_entries.in"
+#include "./gl_entries.in"
+#undef EGL_ENTRY
+#undef GL_ENTRY
+};
+
+bool arm_compute::opengles31_is_available()
+{
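+    // glDispatchCompute is an OpenGL ES 3.1 entry point, so its presence is used as a
+    // proxy for compute-shader support.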
+    return GLESSymbols::get().glDispatchCompute != nullptr;
+}
+
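+// Each wrapper below forwards to the lazily resolved function pointer from GLESSymbols and
+// falls back to a benign default (nullptr, EGL_FALSE, 0 or a no-op) when the symbol could
+// not be loaded.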
+__eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress(const char *procname)
+{
+    auto func = GLESSymbols::get().eglGetProcAddress;
+    if(func != nullptr)
+    {
+        return func(procname);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api)
+{
+    auto func = GLESSymbols::get().eglBindAPI;
+    if(func != nullptr)
+    {
+        return func(api);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config)
+{
+    auto func = GLESSymbols::get().eglChooseConfig;
+    if(func != nullptr)
+    {
+        return func(dpy, attrib_list, configs, config_size, num_config);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list)
+{
+    auto func = GLESSymbols::get().eglCreateContext;
+    if(func != nullptr)
+    {
+        return func(dpy, config, share_context, attrib_list);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx)
+{
+    auto func = GLESSymbols::get().eglDestroyContext;
+    if(func != nullptr)
+    {
+        return func(dpy, ctx);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id)
+{
+    auto func = GLESSymbols::get().eglGetDisplay;
+    if(func != nullptr)
+    {
+        return func(display_id);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
+{
+    auto func = GLESSymbols::get().eglInitialize;
+    if(func != nullptr)
+    {
+        return func(dpy, major, minor);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx)
+{
+    auto func = GLESSymbols::get().eglMakeCurrent;
+    if(func != nullptr)
+    {
+        return func(dpy, draw, read, ctx);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy)
+{
+    auto func = GLESSymbols::get().eglTerminate;
+    if(func != nullptr)
+    {
+        return func(dpy);
+    }
+    else
+    {
+        return EGL_FALSE;
+    }
+}
+
+EGLint EGLAPIENTRY eglGetError()
+{
+    auto func = GLESSymbols::get().eglGetError;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return GL_NO_ERROR;
+    }
+}
+
+char const *EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name)
+{
+    auto func = GLESSymbols::get().eglQueryString;
+    if(func != nullptr)
+    {
+        return func(dpy, name);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void GL_APIENTRY glAttachShader(GLuint program, GLuint shader)
+{
+    auto func = GLESSymbols::get().glAttachShader;
+    if(func != nullptr)
+    {
+        return func(program, shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glCompileShader(GLuint shader)
+{
+    auto func = GLESSymbols::get().glCompileShader;
+    if(func != nullptr)
+    {
+        return func(shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glCreateProgram()
+{
+    auto func = GLESSymbols::get().glCreateProgram;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+GLuint GL_APIENTRY glCreateShader(GLenum type)
+{
+    auto func = GLESSymbols::get().glCreateShader;
+    if(func != nullptr)
+    {
+        return func(type);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+void GL_APIENTRY glDeleteProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glDeleteProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteShader(GLuint shader)
+{
+    auto func = GLESSymbols::get().glDeleteShader;
+    if(func != nullptr)
+    {
+        return func(shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDetachShader(GLuint program, GLuint shader)
+{
+    auto func = GLESSymbols::get().glDetachShader;
+    if(func != nullptr)
+    {
+        return func(program, shader);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
+{
+    auto func = GLESSymbols::get().glGetProgramInfoLog;
+    if(func != nullptr)
+    {
+        return func(program, bufSize, length, infoLog);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetProgramiv(GLuint program, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetProgramiv;
+    if(func != nullptr)
+    {
+        return func(program, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
+{
+    auto func = GLESSymbols::get().glGetShaderInfoLog;
+    if(func != nullptr)
+    {
+        return func(shader, bufSize, length, infoLog);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGetShaderiv(GLuint shader, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetShaderiv;
+    if(func != nullptr)
+    {
+        return func(shader, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glLinkProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glLinkProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glShaderSource(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length)
+{
+    auto func = GLESSymbols::get().glShaderSource;
+    if(func != nullptr)
+    {
+        return func(shader, count, string, length);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUseProgram(GLuint program)
+{
+    auto func = GLESSymbols::get().glUseProgram;
+    if(func != nullptr)
+    {
+        return func(program);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindBuffer(GLenum target, GLuint buffer)
+{
+    auto func = GLESSymbols::get().glBindBuffer;
+    if(func != nullptr)
+    {
+        return func(target, buffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindBufferBase(GLenum target, GLuint index, GLuint buffer)
+{
+    auto func = GLESSymbols::get().glBindBufferBase;
+    if(func != nullptr)
+    {
+        return func(target, index, buffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage)
+{
+    auto func = GLESSymbols::get().glBufferData;
+    if(func != nullptr)
+    {
+        return func(target, size, data, usage);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteBuffers(GLsizei n, const GLuint *buffers)
+{
+    auto func = GLESSymbols::get().glDeleteBuffers;
+    if(func != nullptr)
+    {
+        return func(n, buffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDispatchCompute(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)
+{
+    auto func = GLESSymbols::get().glDispatchCompute;
+    if(func != nullptr)
+    {
+        return func(num_groups_x, num_groups_y, num_groups_z);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glFlush(void)
+{
+    auto func = GLESSymbols::get().glFlush;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGenBuffers(GLsizei n, GLuint *buffers)
+{
+    auto func = GLESSymbols::get().glGenBuffers;
+    if(func != nullptr)
+    {
+        return func(n, buffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glGetProgramResourceIndex(GLuint program, GLenum programInterface, const GLchar *name)
+{
+    auto func = GLESSymbols::get().glGetProgramResourceIndex;
+    if(func != nullptr)
+    {
+        return func(program, programInterface, name);
+    }
+    else
+    {
+        return GL_INVALID_INDEX;
+    }
+}
+
+GLint GL_APIENTRY glGetUniformLocation(GLuint program, const GLchar *name)
+{
+    auto func = GLESSymbols::get().glGetUniformLocation;
+    if(func != nullptr)
+    {
+        return func(program, name);
+    }
+    else
+    {
+        return -1;
+    }
+}
+
+void *GL_APIENTRY glMapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access)
+{
+    auto func = GLESSymbols::get().glMapBufferRange;
+    if(func != nullptr)
+    {
+        return func(target, offset, length, access);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+void GL_APIENTRY glMemoryBarrier(GLbitfield barriers)
+{
+    auto func = GLESSymbols::get().glMemoryBarrier;
+    if(func != nullptr)
+    {
+        return func(barriers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUniform1ui(GLint location, GLuint v0)
+{
+    auto func = GLESSymbols::get().glUniform1ui;
+    if(func != nullptr)
+    {
+        return func(location, v0);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLboolean GL_APIENTRY glUnmapBuffer(GLenum target)
+{
+    auto func = GLESSymbols::get().glUnmapBuffer;
+    if(func != nullptr)
+    {
+        return func(target);
+    }
+    else
+    {
+        return GL_FALSE;
+    }
+}
+
+GLenum GL_APIENTRY glGetError(void)
+{
+    auto func = GLESSymbols::get().glGetError;
+    if(func != nullptr)
+    {
+        return func();
+    }
+    else
+    {
+        return GL_NO_ERROR;
+    }
+}
+
+void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params)
+{
+    auto func = GLESSymbols::get().glGetActiveUniformBlockiv;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockIndex, pname, params);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glUniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding)
+{
+    auto func = GLESSymbols::get().glUniformBlockBinding;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockIndex, uniformBlockBinding);
+    }
+    else
+    {
+        return;
+    }
+}
+
+GLuint GL_APIENTRY glGetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName)
+{
+    auto func = GLESSymbols::get().glGetUniformBlockIndex;
+    if(func != nullptr)
+    {
+        return func(program, uniformBlockName);
+    }
+    else
+    {
+        return GL_INVALID_INDEX;
+    }
+}
+
+void GL_APIENTRY glGenTextures(GLsizei n, GLuint *textures)
+{
+    auto func = GLESSymbols::get().glGenTextures;
+    if(func != nullptr)
+    {
+        return func(n, textures);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteTextures(GLsizei n, const GLuint *textures)
+{
+    auto func = GLESSymbols::get().glDeleteTextures;
+    if(func != nullptr)
+    {
+        return func(n, textures);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindTexture(GLenum target, GLuint texture)
+{
+    auto func = GLESSymbols::get().glBindTexture;
+    if(func != nullptr)
+    {
+        return func(target, texture);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels)
+{
+    auto func = GLESSymbols::get().glTexImage2D;
+    if(func != nullptr)
+    {
+        return func(target, level, internalformat, width, height, border, format, type, pixels);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glGenFramebuffers(GLsizei n, GLuint *framebuffers)
+{
+    auto func = GLESSymbols::get().glGenFramebuffers;
+    if(func != nullptr)
+    {
+        return func(n, framebuffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glDeleteFramebuffers(GLsizei n, const GLuint *framebuffers)
+{
+    auto func = GLESSymbols::get().glDeleteFramebuffers;
+    if(func != nullptr)
+    {
+        return func(n, framebuffers);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glBindFramebuffer(GLenum target, GLuint framebuffer)
+{
+    auto func = GLESSymbols::get().glBindFramebuffer;
+    if(func != nullptr)
+    {
+        return func(target, framebuffer);
+    }
+    else
+    {
+        return;
+    }
+}
+
+void GL_APIENTRY glFramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level)
+{
+    auto func = GLESSymbols::get().glFramebufferTexture2D;
+    if(func != nullptr)
+    {
+        return func(target, attachment, textarget, texture, level);
+    }
+    else
+    {
+        return;
+    }
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
new file mode 100644
index 0000000..d06de3a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+/** Calculate the absolute difference of two input images.
+ *
+ * @param[in]  src1_ptr   Pointer to the first source image. Supported data types: U8
+ * @param[in]  src1_attrs The attributes of the first source image
+ * @param[in]  src2_ptr   Pointer to the second source image. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_attrs The attributes of the second source image
+ * @param[out] dst_ptr    Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_attrs  The attributes of the destination image
+ */
+SHADER_PARAMS_DECLARATION
+{
+    ImageAttributes src1_attrs;
+    ImageAttributes src2_attrs;
+    ImageAttributes dst_attrs;
+};
+
+TENSOR_DECLARATION(1, src1Buffer, uint, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(2, src2Buffer, uint, src2_ptr, src2_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
+void main(void)
+{
+    ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
+    ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
+    ImageIterator dst_iter  = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+    lowp uvec4 tmp1 = LOAD_UNPACK4_CURRENT_ITEM_U8(src1_ptr, src1_iter);
+    lowp uvec4 tmp2 = LOAD_UNPACK4_CURRENT_ITEM_U8(src2_ptr, src2_iter);
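+    // The unsigned subtraction wraps on underflow; reinterpreting the result as signed and
+    // taking abs() recovers |src1 - src2| for U8 values.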
+    lowp uvec4 diff = uvec4(abs(ivec4(tmp1 - tmp2)));
+
+    STORE_PACK4_CURRENT_ITEM_U8(dst_ptr, dst_iter, diff);
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
new file mode 100644
index 0000000..38ba183
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+#elif defined(DATA_TYPE_FP16)
+#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
+precision highp float;
+#else  /*LOGISTIC_TANH_SRELU_SQRT*/
+precision mediump float;
+#endif /*LOGISTIC_TANH_SRELU_SQRT*/
+#endif /*DATA_TYPE_FP32*/
+
+#define ABS_OP(a) abs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define CONST_ONE (1.f)
+
+// Logistic Activation
+float logistic_op(float x)
+{
+    return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+float tanh_op(float x)
+{
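+    // For |B_VAL * x| > 10 tanh has already saturated, so +/-A_VAL is returned directly
+    // instead of evaluating tanh().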
+    float tmp = float(B_VAL) * x;
+    if(tmp > 10.f)
+    {
+        return MUL_OP(float(A_VAL), 1.f);
+    }
+    else if(tmp < -10.f)
+    {
+        return MUL_OP(float(A_VAL), -1.f);
+    }
+    else
+    {
+        return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
+    }
+}
+// RELU Activation
+float relu_op(float x)
+{
+    return max(0.f, x);
+}
+// Bounded RELU Activation
+float brelu_op(float x)
+{
+    return min(float(A_VAL), max(float(0.0), x));
+}
+// Lower Upper Bounded RELU Activation
+float lu_brelu_op(float x)
+{
+    return min(max(x, float(B_VAL)), float(A_VAL));
+}
+// Leaky RELU Activation
+float lrelu_op(float x)
+{
+    return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
+}
+// Soft RELU Activation
+float srelu_op(float x)
+{
+    return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+float abs_op(float x)
+{
+    return ABS_OP(x);
+}
+// Square Activation
+float square_op(float x)
+{
+    return MUL_OP(x, x);
+}
+// Square-root Activation
+float sqrt_op(float x)
+{
+    return SQRT_OP(x);
+}
+// Linear Activation
+float linear_op(float x)
+{
+    return MLA_OP(float(B_VAL), float(A_VAL), x);
+}
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#ifdef DATA_TYPE_FP32
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
+ * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float data     = src_ptr[src.current_offset];
+    float data_out = 0.f;
+    // Perform activation
+
+#ifdef LOGISTIC
+    data_out = logistic_op(data);
+#elif defined(TANH)     /*TANH*/
+    data_out = tanh_op(data);
+#elif defined(RELU)     /*RELU*/
+    data_out = relu_op(data);
+#elif defined(BRELU)    /*BRELU*/
+    data_out = brelu_op(data);
+#elif defined(LU_BRELU) /*LU_BRELU*/
+    data_out = lu_brelu_op(data);
+#elif defined(LRELU)    /*LRELU*/
+    data_out = lrelu_op(data);
+#elif defined(SRELU)    /*SRELU*/
+    data_out = srelu_op(data);
+#elif defined(ABS)      /*ABS*/
+    data_out = abs_op(data);
+#elif defined(SQUARE)   /*SQUARE*/
+    data_out = square_op(data);
+#elif defined(SQRT)     /*SQRT*/
+    data_out = sqrt_op(data);
+#elif defined(LINEAR)   /*LINEAR*/
+    data_out = linear_op(data);
+#else                   /*LOGISTIC*/
+#error Activation function not provided
+#endif /*LOGISTIC*/
+
+    dst_ptr[dst.current_offset] = data_out;
+}
+
+#elif defined(DATA_TYPE_FP16)
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
+ * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
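+    // Each 32-bit word packs two F16 elements; both halves are activated and re-packed below.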
+    uint data = src_ptr[src.current_offset >> 2];
+    // Perform activation
+    float a = unpackHalf2x16(data).x;
+    float b = unpackHalf2x16(data).y;
+    vec2  data_out;
+#ifdef LOGISTIC         /*LOGISTIC*/
+    data_out.x = logistic_op(a);
+    data_out.y = logistic_op(b);
+#elif defined(TANH)     /*TANH*/
+    data_out.x = tanh_op(a);
+    data_out.y = tanh_op(b);
+#elif defined(RELU)     /*RELU*/
+    data_out.x = relu_op(a);
+    data_out.y = relu_op(b);
+#elif defined(BRELU)    /*BRELU*/
+    data_out.x = brelu_op(a);
+    data_out.y = brelu_op(b);
+#elif defined(LU_BRELU) /*LU_BRELU*/
+    data_out.x = lu_brelu_op(a);
+    data_out.y = lu_brelu_op(b);
+#elif defined(LRELU)    /*LRELU*/
+    data_out.x = lrelu_op(a);
+    data_out.y = lrelu_op(b);
+#elif defined(SRELU)    /*SRELU*/
+    data_out.x = srelu_op(a);
+    data_out.y = srelu_op(b);
+#elif defined(ABS)      /*ABS*/
+    data_out.x = abs_op(a);
+    data_out.y = abs_op(b);
+#elif defined(SQUARE)   /*SQUARE*/
+    data_out.x = square_op(a);
+    data_out.y = square_op(b);
+#elif defined(SQRT)     /*SQRT*/
+    data_out.x = sqrt_op(a);
+    data_out.y = sqrt_op(b);
+#elif defined(LINEAR)   /*LINEAR*/
+    data_out.x = linear_op(a);
+    data_out.y = linear_op(b);
+#else                   /*LOGISTIC*/
+#error Activation function not provided
+#endif /*LOGISTIC*/
+
+    dst_ptr[dst.current_offset >> 2] = packHalf2x16(data_out);
+}
+#endif /*DATA_TYPE_FP32*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
new file mode 100644
index 0000000..53fb515
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif /*DATA_TYPE_FP16*/
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) inversesqrt((a))
+#define SQCVT_SAT(a) (a)
+
+/** Apply batch normalization.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
+ *
+ * @param[in]  src_ptr     Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_attrs   The attributes of the source tensor
+ * @param[out] dst_ptr     Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs   The attributes of the destination tensor
+ * @param[in]  mean_ptr    Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_attrs  The attributes of the mean tensor
+ * @param[in]  var_ptr     Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in]  var_attrs   The attributes of the var tensor
+ * @param[in]  beta_ptr    Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  beta_attrs  The attributes of the beta tensor
+ * @param[in]  gamma_ptr   Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  gamma_attrs The attributes of the gamma tensor
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    VectorAttributes   mean_attrs;
+    VectorAttributes   var_attrs;
+    VectorAttributes   beta_attrs;
+    VectorAttributes   gamma_attrs;
+};
+
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, meanBuffer, float, mean_ptr, mean_shift, 2, readonly);
+TENSOR_DECLARATION(4, varBuffer, float, var_ptr, var_shift, 2, readonly);
+TENSOR_DECLARATION(5, betaBuffer, float, beta_ptr, beta_shift, 2, readonly);
+TENSOR_DECLARATION(6, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
+
+void main(void)
+{
+    Tensor3DIterator src_iter   = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator dst_iter   = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+    VectorIterator   mean_iter  = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
+    VectorIterator   var_iter   = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+    VectorIterator   beta_iter  = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+    VectorIterator   gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+
+    float input_value = 0.f;
+    float denominator = 0.f;
+    float numerator   = 0.f;
+    float x_bar       = 0.f;
+    float gamma_param = 0.f;
+    float beta_param  = 0.f;
+
+    uint current_slice = gl_GlobalInvocationID.z;
+
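+    // Batch normalization: out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with the
+    // per-channel parameters indexed by the current Z slice.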
+    input_value = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+    denominator = LOAD(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+    // Calculate x bar and store results
+    numerator = LOAD(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
+    numerator = SUB_OP(input_value, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+    gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
+    beta_param  = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+}
+
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
+TENSOR_DECLARATION(4, varBuffer, uvec2, var_ptr, var_shift, 3, readonly);
+TENSOR_DECLARATION(5, betaBuffer, uvec2, beta_ptr, beta_shift, 3, readonly);
+TENSOR_DECLARATION(6, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
+
+void main(void)
+{
+    Tensor3DIterator src_iter   = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator dst_iter   = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+    VectorIterator   mean_iter  = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
+    VectorIterator   var_iter   = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
+    VectorIterator   beta_iter  = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
+    VectorIterator   gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
+
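+    // In the FP16 path each uvec2 load unpacks four half-precision values, so the
+    // per-channel parameters (var/mean/gamma/beta) arrive four channels at a time and the
+    // component matching this Z slice is selected via current_slice % 4 below.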
+    vec4  unpacked_s[5];
+    float denominator;
+    float numerator;
+    float gamma_param;
+    float beta_param;
+    vec4  x_bar;
+    vec4  result;
+
+    uint current_slice = gl_GlobalInvocationID.z;
+    unpacked_s[0]      = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    unpacked_s[1]      = LOAD_UNPACK4_HALF(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
+    unpacked_s[2]      = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
+    unpacked_s[3]      = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * beta_attrs.stride_x));
+    unpacked_s[4]      = LOAD_UNPACK4_HALF(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
+
+    if((current_slice % uint(4)) == uint(0))
+    {
+        denominator = unpacked_s[1].x;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].x;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].x;
+        beta_param  = unpacked_s[4].x;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+    }
+    else if((current_slice % uint(4)) == uint(1))
+    {
+        denominator = unpacked_s[1].y;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].y;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].y;
+        beta_param  = unpacked_s[4].y;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+    }
+    else if((current_slice % uint(4)) == uint(2))
+    {
+        denominator = unpacked_s[1].z;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].z;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].z;
+        beta_param  = unpacked_s[4].z;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+    }
+    else
+    {
+        denominator = unpacked_s[1].w;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].w;
+        x_bar     = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].w;
+        beta_param  = unpacked_s[4].w;
+        result      = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+    }
+}
+#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
new file mode 100644
index 0000000..65000f2
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
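+    // OFFSETS_X/Y/Z are passed as compile-time defines; OFFSETS_Z is a byte offset into the
+    // output, converted to a float element index here with the >> 2.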
+    dst_ptr[dst.current_offset + uint(OFFSETS_Z >> 2)] = src_ptr[tensor3D_offset(src, -OFFSETS_X, -OFFSETS_Y, 0)];
+}
+
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    uvec2 packed_s;
+    GC_LOAD1_3D_OFFSET(packed_s, src, -OFFSETS_X, -OFFSETS_Y, 0);
+    dst_ptr[(dst.current_offset + uint(OFFSETS_Z)) >> 3] = packed_s;
+}
+#endif /*DATA_TYPE_FP32*/
\ No newline at end of file
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
new file mode 100644
index 0000000..9976368
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+#ifdef IM2COL_GENERIC
+    TENSOR3D_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+    uint filter_depth;
+    uint src_stride_w;
+    uint dst_stride_w;
+#endif // IM2COL_GENERIC
+
+#ifdef IM2COL_REDUCED
+    TENSOR3D_PARAM_DECLARATION(src);
+    VECTOR_PARAM_DECLARATION(dst);
+    uint width;
+    uint height;
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+    IMAGE_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    uint width;
+#endif // COL2IM
+};
+
+#ifdef DATA_TYPE_FP16
+#if defined(IM2COL_REDUCED_8X)
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, restrict);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, restrict);
+#else                            /* IM2COL_REDUCED_8X */
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, restrict);
+#endif                           /* IM2COL_REDUCED_8X */
+
+precision mediump float;
+
+#ifdef IM2COL_REDUCED
+#if defined(IM2COL_REDUCED_GENERIC)
+/** This kernel reshapes the tensor's lowest three dimensions to a single row for the GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append the final matrix with a 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    uvec3    size           = uvec3(gl_WorkGroupSize.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D src_nostep     = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+    uint     image_size     = width * height;
+    uint     element_count  = src_step_x / src_stride_x;
+    uint     tmp_out_offset = dst.current_offset + ((pos.x * element_count + pos.y * width + pos.z * image_size) * dst.stride_x);
+    uint     width_fp16     = ((width + uint(1)) >> uint(1));
+    uint     tmp;
+
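+    // With FP16 two elements share one 32-bit word, so an odd row width leaves every other
+    // logical row starting in the middle of a packed word; those rows are rebuilt below from
+    // the halves of two neighbouring words.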
+    // odd width
+    if(width % uint(2) != uint(0))
+    {
+        // even row
+        if((pos.y + pos.z * height) % uint(2) == uint(0))
+        {
+            LOAD1(tmp, src, src.current_offset >> uint(2));
+            STORE1(dst, tmp_out_offset >> uint(2), tmp);
+        }
+        else
+        {
+            // special op
+            uint tmpleft  = uint(0);
+            uint tmpright = uint(0);
+            LOAD1(tmpright, src, src.current_offset >> uint(2)); // right half
+            if(pos.x == uint(0))
+            {
+                LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, int(width), int(pos.y) - 1, int(pos.z)) >> uint(2)); // left half
+                tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16));
+            }
+            else
+            {
+                LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)) >> uint(2)); // left half
+                tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16)));
+            }
+            STORE1(dst, tmp_out_offset >> uint(2), tmpright);
+        }
+    }
+    else
+    {
+        LOAD1(tmp, src, src.current_offset >> uint(2));
+        STORE1(dst, tmp_out_offset >> uint(2), tmp);
+    }
+
+#ifdef HAS_BIAS
+    // If it is the last thread in the 3 dimensional workgroup
+    if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+    {
+        tmp_out_offset += dst.stride_x;
+
+        mediump vec2 bias_vec = vec2(1.0f, 1.0f);
+        uint         bias_u   = packHalf2x16(bias_vec);
+        STORE1(dst, tmp_out_offset >> uint(2), bias_u);
+    }
+#endif // HAS_BIAS
+}
+#else /* IM2COL_REDUCED_GENERIC */
+/** This kernel reshapes the tensor's lowest three dimensions to a single row for the GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note In case biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append the final matrix with a 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+#if defined(IM2COL_REDUCED_8X)
+    uint     tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uvec4    tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(4));
+    STORE1(dst, tmp_out_offset >> uint(4), tmp);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+    uint  tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uvec2 tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(3));
+    STORE1(dst, tmp_out_offset >> uint(3), tmp);
+#else                            /* IM2COL_REDUCED_8X */
+    uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
+    uint tmp;
+    LOAD1(tmp, src, src.current_offset >> uint(2));
+    STORE1(dst, tmp_out_offset >> uint(2), tmp);
+#endif                           /* IM2COL_REDUCED_8X */
+}
+#endif                           /* IM2COL_REDUCED_GENERIC */
+#endif                           // IM2COL_REDUCED
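+
+/* Illustrative offset check for the vectorized variants above (assumption: the host only
+ * selects IM2COL_REDUCED_8X / IM2COL_REDUCED_4X when every flattened row is a multiple of
+ * 8 / 4 halves). With the 8X variant each invocation copies one uvec4 = 8 halves = 16 bytes;
+ * for width = 16, dst.stride_x = 2 and pos = (1, 1, 0) (taking dst.current_offset = 0):
+ *     tmp_out_offset = ((1 * 8) + (1 * 16)) * 2 = 48 bytes,  48 >> 4 = uvec4 index 3,
+ * i.e. flattened halves 24..31, the second half of flattened row 1.
+ */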
+
+#elif defined(DATA_TYPE_FP32)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
+
+#ifdef IM2COL_GENERIC
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added to the convolution, "#define HAS_BIAS" has to be passed to append a 1 at the end of each row of the output matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  filter_depth                      The depth of the used filter
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+void main(void)
+{
+    uint xc    = gl_GlobalInvocationID.x;                // x coordinate in the convolved tensor
+    uint yc    = gl_GlobalInvocationID.y;                // y coordinate in the convolved tensor
+    uint ch    = gl_GlobalInvocationID.z % filter_depth; // input feature map
+    uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch
+
+    // Calculate input indices (signed, as they can be negative when padding is used)
+    int  xi           = int(xc) * STRIDE_X - PAD_X;
+    int  yi           = int(yc) * STRIDE_Y - PAD_Y;
+    uint input_offset = (src_offset_first_element_in_bytes + (ch * src_stride_z) + (batch * src_stride_w)) >> uint(2);
+
+    // Calculate output indices
+    uint xo            = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
+    uint yo            = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
+    uint output_offset = (dst_offset_first_element_in_bytes + (yo * dst_stride_y) + (batch * dst_stride_w) + (xo * dst_stride_x)) >> uint(2);
+
+    // Linearize convolution elements
+    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    {
+        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x)
+        {
+#if PAD_X == 0 && PAD_Y == 0
+            STORE4(dst, output_offset, LOAD4(src, input_offset + ((uint(x) * src_stride_x + uint(y) * src_stride_y) >> uint(2))));
+#else  // PAD_X == 0 && PAD_Y == 0
+            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+            {
+                STORE4(dst, output_offset, 0.0f);
+            }
+            else
+            {
+                STORE4(dst, output_offset, LOAD4(src, input_offset + ((uint(x) * src_stride_x + uint(y) * src_stride_y) >> uint(2))));
+            }
+#endif // PAD_X == 0 && PAD_Y == 0
+            // Advance to the next element of the current im2col row
+            ++output_offset;
+        }
+    }
+
+#ifdef HAS_BIAS
+    if(ch == (uint(KERNEL_DEPTH) - 1))
+    {
+        STORE4(dst, output_offset, 1.0f);
+    }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_GENERIC
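+
+/* Worked example of the output mapping used above (illustrative only): each 3D input patch
+ * becomes one row of the im2col matrix, with
+ *     xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT   (column where this channel's KWxKH block starts)
+ *     yo = xc + yc * CONVOLVED_WIDTH           (one row per output position)
+ * For a 3x3 kernel with CONVOLVED_WIDTH = 4, ch = 2 and (xc, yc) = (1, 2):
+ *     xo = 2 * 9 = 18,   yo = 1 + 2 * 4 = 9
+ * so the 9 values of input channel 2 around output position (1, 2) land in row 9,
+ * columns 18..26 of the destination matrix.
+ */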
+
+#ifdef IM2COL_REDUCED
+/** This kernel reshapes the tensor's low three dimensions to a single row for the GEMM operation
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note In case biases will be added at a later stage, "#define HAS_BIAS" has to be passed to append a 1 at the end of each row of the output matrix.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  width                             The width of the input tensor
+ * @param[in]  height                            The height of the input tensor
+ */
+void main(void)
+{
+    uvec3    pos            = uvec3(gl_GlobalInvocationID.xyz);
+    uvec3    size           = uvec3(gl_WorkGroupSize.xyz);
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Vector   dst            = CONVERT_TO_VECTOR_STRUCT_NO_STEP(dst);
+    uint     image_size     = width * height;
+    uint     tmp_out_offset = dst.current_offset + (((pos.x + pos.y * width + pos.z * image_size) * dst.stride_x) >> 2);
+
+    STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+
+#ifdef HAS_BIAS
+    // If it is the last thread in the 3 dimensional workgroup
+    if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+    {
+        tmp_out_offset += (dst.stride_x >> uint(2));
+        STORE4(dst, tmp_out_offset, 1.f);
+    }
+#endif // HAS_BIAS
+}
+#endif // IM2COL_REDUCED
+
+#ifdef COL2IM
+/** This kernel performs a reshaping of the output of the convolution layer.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+void main(void)
+{
+    uvec2    pos = uvec2(gl_GlobalInvocationID.xy);
+    Image    src = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    uint idx            = pos.x * dst.stride_z + (pos.y / width) * dst.stride_y + (pos.y % width) * dst.stride_x;
+    uint tmp_out_offset = dst.current_offset + (idx >> 2);
+
+    STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+}
+#endif // COL2IM
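+
+/* Worked example of the col2im mapping (illustrative only): pos.x selects the output feature
+ * map (written along dst z) and pos.y is the flattened spatial index, split back into
+ * coordinates with / and %. For width = 4 and pos = (3, 6) the value is written to
+ * dst(x = 6 % 4 = 2, y = 6 / 4 = 1, z = 3).
+ */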
+
+#else // DATA_TYPE_FP16
+#error Data type not supported
+#endif // DATA_TYPE_FP16
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
new file mode 100644
index 0000000..190d7d6
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -0,0 +1,1010 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note This kernel has multiple optimized direct convolution options for FP16.
+ *       The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution, "#define BIAS" has to be passed at compile time.
+ *
+ * @param[in]  src_ptr          Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_attrs        The attributes of the source tensor
+ * @param[out] dst_ptr          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs        The attributes of the destination tensor
+ * @param[in]  weights_ptr      Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_attrs    The attributes of the weights tensor
+ * @param[in]  biases_ptr       Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_attrs     The attributes of the biases tensor
+ * @param[in]  weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth    The third dimension of the weights tensor
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes weights_attrs;
+#ifdef BIAS
+    VectorAttributes biases_attrs;
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    float pixels  = 0.f;
+    uint  z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+    float temp;
+    float temp_weight;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        temp        = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+        temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
+        pixels += temp * temp_weight;
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+
+#ifdef BIAS
+    pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+#endif /* BIAS */
+
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
+}
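+
+/* Note (illustrative): for a 1x1 kernel the convolution collapses to a dot product along the
+ * input depth, which is what the loop above computes:
+ *     dst(x, y, z) = sum_d src(x', y', d) * weights(0, 0, d, z)  (+ biases(z))
+ * where (x', y') is the strided input position selected through the source iterator, and the
+ * kernel for output map z is reached by advancing the weights iterator by z * weights_stride_w.
+ */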
+
+#elif defined(DATA_TYPE_FP16)
+#if defined(PROCESS_4X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve_stride1(ImageIterator src_iter, float w)
+{
+    vec4 s;
+    s = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+
+    s *= w;
+
+    return s;
+}
+
+vec4 convolve_stride2(ImageIterator src_iter, float w)
+{
+    vec4 s[2];
+    vec4 r;
+
+    s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+    r    = vec4(s[0].xz, s[1].xz);
+
+    r *= w;
+
+    return r;
+}
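+
+/* Note (illustrative): for STRIDE_X == 2 the four outputs of one invocation come from input
+ * columns 0, 2, 4 and 6 (relative to the iterator). The two packed loads above unpack
+ * columns 0..3 into s[0] and columns 4..7 into s[1], and vec4(s[0].xz, s[1].xz) gathers the
+ * even columns (0, 2, 4, 6) in a single swizzle.
+ */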
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1      = vec2_w.x;
+        vec4 r1 = CONVOLVE(src_iter, w1);
+        pixels += r1;
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2      = vec2_w.y;
+        vec4 r2 = CONVOLVE(src_iter, w2);
+        pixels += r2;
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r = CONVOLVE(src_iter, w);
+        pixels += r;
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels += b;
+#endif /* BIAS */
+
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
+}
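+
+/* Note (illustrative): in the WEIGHTS_OPTIMIZATION path above the fp16 weights of two
+ * consecutive input channels share one 32-bit word, so a single LOAD_UNPACK2 yields both w1
+ * and w2 and the depth loop runs weights_depth / 2 times; this path appears to assume an even
+ * weights_depth. The biases are packed the same way, which is why the word covering bias
+ * z_index is unpacked and .x or .y is selected according to the parity of z_index.
+ */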
+#elif defined(PROCESS_4X_2Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
+{
+    vec4 s[2];
+    s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 r[2];
+
+    s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+    r[0]  = vec4(s1[0].xz, s1[1].xz);
+
+    s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+    s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
+    r[1]  = vec4(s2[0].xz, s2[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    pixels[0] = vec4(0.f);
+    pixels[1] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1         = vec2_w.x;
+        vec4 r1[2] = CONVOLVE(src_iter, w1);
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2         = vec2_w.y;
+        vec4 r2[2] = CONVOLVE(src_iter, w2);
+        pixels[0] += r2[0];
+        pixels[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r[2] = CONVOLVE(src_iter, w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels[0] += b;
+    pixels[1] += b;
+#endif /* BIAS */
+
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+}
+#elif defined(PROCESS_4X_3Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[3] convolve_stride1(ImageIterator src_iter, float w)
+{
+    vec4 s[3];
+    s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+    s[2] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));
+
+    s[0] *= w;
+    s[1] *= w;
+    s[2] *= w;
+
+    return s;
+}
+
+vec4[3] convolve_stride2(ImageIterator src_iter, float w)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 s3[2];
+    vec4 r[3];
+
+    s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+    r[0]  = vec4(s1[0].xz, s1[1].xz);
+
+    s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+    s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
+    r[1]  = vec4(s2[0].xz, s2[1].xz);
+
+    s3[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));
+    s3[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, (2 * int(STRIDE_Y))));
+    r[2]  = vec4(s3[0].xz, s3[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+    r[2] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels[3];
+    pixels[0] = vec4(0.f);
+    pixels[1] = vec4(0.f);
+    pixels[2] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1         = vec2_w.x;
+        vec4 r1[3] = CONVOLVE(src_iter, w1);
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+        pixels[2] += r1[2];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2         = vec2_w.y;
+        vec4 r2[3] = CONVOLVE(src_iter, w2);
+        pixels[0] += r2[0];
+        pixels[1] += r2[1];
+        pixels[2] += r2[2];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r[3] = CONVOLVE(src_iter, w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+        pixels[2] += r[2];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels[0] += b;
+    pixels[1] += b;
+    pixels[2] += b;
+#endif /* BIAS */
+
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
+}
+#elif defined(PROCESS_4X_4Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
+{
+    vec4 s[2];
+    s[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
+    s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
+vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 r[2];
+
+    s1[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
+    s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), y1));
+    r[0]  = vec4(s1[0].xz, s1[1].xz);
+
+    s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));
+    s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), (y1 + int(STRIDE_Y))));
+    r[1]  = vec4(s2[0].xz, s2[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    vec4 pixels1[2];
+    pixels[0]  = vec4(0.f);
+    pixels[1]  = vec4(0.f);
+    pixels1[0] = vec4(0.f);
+    pixels1[1] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1         = vec2_w.x;
+        vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
+        vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (2 * int(STRIDE_Y)));
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+        pixels1[0] += r2[0];
+        pixels1[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2         = vec2_w.y;
+        vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
+        vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (2 * int(STRIDE_Y)));
+        pixels[0] += r3[0];
+        pixels[1] += r3[1];
+        pixels1[0] += r4[0];
+        pixels1[1] += r4[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
+        vec4 r2[2] = CONVOLVE(src_iter, w, 0, (2 * int(STRIDE_Y)));
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+        pixels1[0] += r2[0];
+        pixels1[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels[0] += b;
+    pixels[1] += b;
+    pixels1[0] += b;
+    pixels1[1] += b;
+#endif /* BIAS */
+
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
+    STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels1[1]);
+}
+#elif defined(PROCESS_4X_2Y_2Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
+{
+    vec4 s[2];
+    s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 r[2];
+
+    s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+    r[0]  = vec4(s1[0].xz, s1[1].xz);
+
+    s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
+    s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
+    r[1]  = vec4(s2[0].xz, s2[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    uint z_base_index = uint(gl_GlobalInvocationID.z) << uint(1);
+
+    // store the original src current offset
+    int s_offset_in_bytes = src_iter.current_offset_in_bytes;
+
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);
+
+    for(int z = 0; z < 2; ++z)
+    {
+        uint z_index = z_base_index + uint(z);
+
+        src_iter.current_offset_in_bytes = s_offset_in_bytes;
+
+        vec4 pixels[2];
+        pixels[0] = vec4(0.f);
+        pixels[1] = vec4(0.f);
+
+#ifdef WEIGHTS_OPTIMIZATION
+        float w1, w2;
+        int   nums = (int(weights_depth)) / 2;
+        for(int d = 0; d < nums; ++d)
+        {
+            vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+            w1         = vec2_w.x;
+            vec4 r1[2] = CONVOLVE(src_iter, w1);
+            pixels[0] += r1[0];
+            pixels[1] += r1[1];
+
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+            w2         = vec2_w.y;
+            vec4 r2[2] = CONVOLVE(src_iter, w2);
+            pixels[0] += r2[0];
+            pixels[1] += r2[1];
+
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+        }
+#else  /* WEIGHTS_OPTIMIZATION */
+        float w;
+        for(int d = 0; d < int(weights_depth); ++d)
+        {
+            w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+            vec4 r[2] = CONVOLVE(src_iter, w);
+            pixels[0] += r[0];
+            pixels[1] += r[1];
+
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+            TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+        }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+        vec2  vec2_b;
+        float b;
+
+        vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+        if(z_index % uint(2) == uint(0))
+        {
+            b = vec2_b.x;
+        }
+        else
+        {
+            b = vec2_b.y;
+        }
+
+        pixels[0] += b;
+        pixels[1] += b;
+#endif /* BIAS */
+
+        STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+        STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
+    }
+}
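+
+/* Note (illustrative): this variant computes two consecutive output feature maps per
+ * invocation. The source offset is saved and restored so that both z iterations read the same
+ * input window, while the weights iterator simply keeps advancing: after walking the
+ * weights_depth planes of kernel z it is left at the start of kernel z + 1, which relies on
+ * the kernels being stored contiguously (weights_stride_w == weights_depth * weights_attrs.stride_z).
+ */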
+#elif defined(PROCESS_8X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w) convolve_stride2(s, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w) convolve_stride1(s, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
+{
+    vec4 s[2];
+    s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 r[2];
+
+    s1   = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
+    r[0] = vec4(s1[0].xz, s1[1].xz);
+    s2   = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
+    r[1] = vec4(s2[0].xz, s2[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    pixels[0] = vec4(0.f);
+    pixels[1] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1         = vec2_w.x;
+        vec4 r1[2] = CONVOLVE(src_iter, w1);
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2         = vec2_w.y;
+        vec4 r2[2] = CONVOLVE(src_iter, w2);
+        pixels[0] += r2[0];
+        pixels[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r[2] = CONVOLVE(src_iter, w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels[0] += b;
+    pixels[1] += b;
+#endif /* BIAS */
+
+    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
+}
+#elif defined(PROCESS_8X_2Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
+{
+    vec4 s[2];
+    s = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
+
+    s[0] *= w;
+    s[1] *= w;
+
+    return s;
+}
+
+vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
+{
+    vec4 s1[2];
+    vec4 s2[2];
+    vec4 r[2];
+
+    s1   = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
+    r[0] = vec4(s1[0].xz, s1[1].xz);
+    s2   = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, (8 + x1), y1));
+    r[1] = vec4(s2[0].xz, s2[1].xz);
+
+    r[0] *= w;
+    r[1] *= w;
+
+    return r;
+}
+
+void main()
+{
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    vec4 pixels1[2];
+    pixels[0]  = vec4(0.f);
+    pixels[1]  = vec4(0.f);
+    pixels1[0] = vec4(0.f);
+    pixels1[1] = vec4(0.f);
+
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
+#ifdef WEIGHTS_OPTIMIZATION
+    float w1, w2;
+    int   nums = (int(weights_depth)) / 2;
+    for(int d = 0; d < nums; ++d)
+    {
+        vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+
+        w1         = vec2_w.x;
+        vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
+        vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (int(STRIDE_Y)));
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+        pixels1[0] += r2[0];
+        pixels1[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+
+        w2         = vec2_w.y;
+        vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
+        vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (int(STRIDE_Y)));
+        pixels[0] += r3[0];
+        pixels[1] += r3[1];
+        pixels1[0] += r4[0];
+        pixels1[1] += r4[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
+    }
+#else  /* WEIGHTS_OPTIMIZATION */
+    float w;
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
+
+        vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
+        vec4 r2[2] = CONVOLVE(src_iter, w, 0, (int(STRIDE_Y)));
+        pixels[0] += r1[0];
+        pixels[1] += r1[1];
+        pixels1[0] += r2[0];
+        pixels1[1] += r2[1];
+
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
+    }
+#endif /* WEIGHTS_OPTIMIZATION */
+
+#ifdef BIAS
+    vec2  vec2_b;
+    float b;
+
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = vec2_b.x;
+    }
+    else
+    {
+        b = vec2_b.y;
+    }
+
+    pixels[0] += b;
+    pixels[1] += b;
+    pixels1[0] += b;
+    pixels1[1] += b;
+#endif /* BIAS */
+
+    STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
+    STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
+}
+#endif /* PROCESS_4X_1Y_1Z */
+#else  /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
new file mode 100644
index 0000000..d450ac1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -0,0 +1,1583 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+#define LOAD12(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2))
+
+#define LOAD3X3(r, name)                                \
+    r[0] = LOAD4(name, tensor3D_offset(name, 0, 0, 0)); \
+    r[1] = LOAD4(name, tensor3D_offset(name, 1, 0, 0)); \
+    r[2] = LOAD4(name, tensor3D_offset(name, 2, 0, 0)); \
+    r[3] = LOAD4(name, tensor3D_offset(name, 0, 1, 0)); \
+    r[4] = LOAD4(name, tensor3D_offset(name, 1, 1, 0)); \
+    r[5] = LOAD4(name, tensor3D_offset(name, 2, 1, 0)); \
+    r[6] = LOAD4(name, tensor3D_offset(name, 0, 2, 0)); \
+    r[7] = LOAD4(name, tensor3D_offset(name, 1, 2, 0)); \
+    r[8] = LOAD4(name, tensor3D_offset(name, 2, 2, 0))
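+
+/* Note (illustrative): LOAD12 reads three consecutive floats into r.xyz (one 1x3 row of the
+ * input or of the weights), while LOAD3X3 gathers a full 3x3 neighbourhood into r[0]..r[8]
+ * in row-major order via tensor3D_offset.
+ */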
+
+#if defined(PROCESS_1_ELEMENT)
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    float pixels = CONVERT(0, float);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        vec3 temp;
+        vec3 w;
+
+        LOAD12(temp, src, offset(src, 0, 0));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        LOAD12(temp, src, offset(src, 0, 1));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        LOAD12(temp, src, offset(src, 0, 2));
+        LOAD12(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+#elif defined(PROCESS_8_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equals 1 or 2 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 2 */
+
+vec4[2] convolve1x3_stride1(uint offset, vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 tmp[3];
+    vec4 r[2];
+
+    LOAD3(tmp, src, offset);
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    middle = vec4(tmp[1].yzw, tmp[2].x);
+    right  = vec4(tmp[1].zw, tmp[2].xy);
+
+    r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
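+
+/* Note (illustrative): the three vec4 loads above cover 12 consecutive input values; "middle"
+ * and "right" are the same data shifted left by one and two elements, so each line of the form
+ *     r[i] = tmp[i] * w[0] + middle * w[1] + right * w[2]
+ * evaluates four neighbouring 1x3 convolutions at once, e.g. r[0].x uses inputs 0, 1, 2 and
+ * r[0].w uses inputs 3, 4, 5.
+ */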
+
+vec4[2] convolve1x3_stride2(uint offset, vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+    vec4 tmp[3];
+    vec4 r[2];
+
+    LOAD3(tmp, src, offset);
+
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    r[0] = left * w[0] + middle * w[1] + right * w[2];
+
+    LOAD2(tmp, src, offset + ((uint(3) * src_stride_x) >> 2));
+
+    left   = vec4(tmp[2].xz, tmp[0].xz);
+    middle = vec4(tmp[2].yw, tmp[0].yw);
+    right  = vec4(tmp[2].z, tmp[0].xz, tmp[1].x);
+
+    r[1] = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 8 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels[2];
+    pixels[0] = vec4(0);
+    pixels[1] = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        vec3 w;
+        vec4 r[2];
+
+        // first line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        r = CONVOLVE1x3(src.current_offset >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        // second line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        // third line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+        pixels[0] += r[0];
+        pixels[1] += r[1];
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+    pixels[0] += vec4(b);
+    pixels[1] += vec4(b);
+#endif /* BIAS */
+
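+    // Write the 8 accumulated results back as two consecutive vec4s.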
+    STORE2(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_4_ELEMENT)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X must be either 1 or 2
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve1x3_stride1(uint offset, vec3 w)
+{
+    vec4 tmp[2];
+    vec4 middle;
+    vec4 right;
+
+    LOAD2(tmp, src, offset);
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    tmp[1] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return tmp[1];
+}
+
+vec4 convolve1x3_stride2(uint offset, vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+
+    vec4 tmp[3];
+
+    LOAD3(tmp, src, offset);
+
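+    // Four stride-2 outputs need input elements 0..8, so three vec4s
+    // (12 values) are loaded and de-interleaved into even, odd and
+    // shifted-even lanes.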
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    tmp[0] = left * w[0] + middle * w[1] + right * w[2];
+
+    return tmp[0];
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels;
+    pixels = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        vec3 w;
+
+        // first line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+
+        pixels += CONVOLVE1x3(src.current_offset >> uint(2), w);
+
+        // second line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+
+        pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+
+        // third line
+        LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+
+        pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+    pixels += vec4(b);
+#endif /* BIAS */
+
+    STORE1(dst, dst.current_offset >> uint(2), pixels);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS)
+BUFFER_DECLARATION(src, 1, vec4, readonly);
+BUFFER_DECLARATION(dst, 2, vec4, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
+
+vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
+{
+    vec4 r;
+
+    r = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4 pixels[3];
+    pixels[0] = vec4(0);
+    pixels[1] = vec4(0);
+    pixels[2] = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w >> 2;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        vec3 w[3];
+
+        LOAD3(w[0], weights, tensor3D_offset(weights, 0, 0, 0));
+        LOAD3(w[1], weights, tensor3D_offset(weights, 0, 1, 0));
+        LOAD3(w[2], weights, tensor3D_offset(weights, 0, 2, 0));
+
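+        // Three output rows are computed per invocation: input rows y..y+4
+        // are read once below and the interior rows are reused by up to
+        // three accumulators, so the vertical overlap of the 3x3 window is
+        // only loaded once.
+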
+        vec4 s[2];
+        vec4 middle;
+        vec4 right;
+        // first line
+        LOAD2(s, src, src.current_offset >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // second line
+        LOAD2(s, src, (src.current_offset + (src_stride_y >> 2)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // third line
+        LOAD2(s, src, (src.current_offset + (src_stride_y >> 1)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[1]);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]);
+
+        // fourth line
+        LOAD2(s, src, (src.current_offset + (uint(3) * (src_stride_y >> 2))) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]);
+
+        // fifth line
+        LOAD2(s, src, (src.current_offset + (src_stride_y)) >> uint(2));
+        middle = vec4(s[0].yzw, s[1].x);
+        right  = vec4(s[0].zw, s[1].xy);
+        pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]);
+
+        src.current_offset += src_stride_z >> 2;
+        weights.current_offset += weights_stride_z >> 2;
+    }
+
+#ifdef BIAS
+    float b;
+    LOAD1(b, biases, vector_offset(biases, int(z_index)));
+
+    pixels[0] += vec4(b);
+    pixels[1] += vec4(b);
+    pixels[2] += vec4(b);
+#endif /* BIAS */
+
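+    // Store the three accumulated rows to three consecutive destination rows.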
+    STORE1(dst, dst.current_offset >> uint(2), pixels[0]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y >> 2)) >> uint(2), pixels[1]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y >> 1)) >> uint(2), pixels[2]);
+}
+#elif defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r[2];
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    middle = vec4(tmp[1].yzw, tmp[2].x);
+    right  = vec4(tmp[1].zw, tmp[2].xy);
+
+    r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[3] load_and_unpack(uint offset)
+{
+    uvec4 packed_s[2];
+    vec4  s[3];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+    ;
+
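+    // The two uvec4 loads fetch 16 packed halves; the first 12 are unpacked
+    // into three vec4s, covering the 10 inputs needed for 8 stride-1 outputs.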
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
+    s[2] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 8x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d[2];
+    uvec4 vd;
+
+    vec4 pixels[3][2];
+    int  i, j;
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] = vec4(0);
+        }
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
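+        // Each 3x3 filter row is 3 halves packed into two uints; unpackHalf2x16
+        // expands them and only the low half of the second uint is used as the
+        // third tap.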
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        uvec4 packed_s[2];
+        vec4  s[3];
+        vec4  r[2];
+        uint  offset;
+        // first line
+        offset = src.current_offset >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[0][0] += r[0];
+        pixels[0][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[0]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[1][0] += r[0];
+        pixels[1][1] += r[1];
+        r = CONVOLVE1x3(s, w[1]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(4);
+        s      = load_and_unpack(offset);
+
+        r = CONVOLVE1x3(s, w[2]);
+        pixels[2][0] += r[0];
+        pixels[2][1] += r[1];
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
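+    // Biases are packed two halves per uint, so the parity of z_index selects
+    // the lower or upper half of the loaded word.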
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 3; i++)
+    {
+        for(j = 0; j < 2; j++)
+        {
+            pixels[i][j] += vec4(b);
+        }
+    }
+#endif /* BIAS */
+
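+    // Re-pack each accumulated row to FP16 (two halves per uint) and store it
+    // as a single uvec4 holding 8 output values.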
+    packed_d[0] = uvec2(packHalf2x16(pixels[0][0].xy), packHalf2x16(pixels[0][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[0][1].xy), packHalf2x16(pixels[0][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, dst.current_offset >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[1][0].xy), packHalf2x16(pixels[1][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[1][1].xy), packHalf2x16(pixels[1][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(4), vd);
+
+    packed_d[0] = uvec2(packHalf2x16(pixels[2][0].xy), packHalf2x16(pixels[2][0].zw));
+    packed_d[1] = uvec2(packHalf2x16(pixels[2][1].xy), packHalf2x16(pixels[2][1].zw));
+    vd          = uvec4(packed_d[0], packed_d[1]);
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(4), vd);
+}
+#elif defined(PROCESS_X_4ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 2
+#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride2(offset)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+#define LOAD_AND_UNPACK(offset) load_and_unpack_stride1(offset)
+#else /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X must be either 1 or 2
+#endif /* STRIDE_X == 2 */
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w)
+{
+    vec4 left;
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    left   = vec4(tmp[0].xz, tmp[1].xz);
+    middle = vec4(tmp[0].yw, tmp[1].yw);
+    right  = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+
+    r = left * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack_stride1(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+vec4[3] load_and_unpack_stride2(uint offset)
+{
+    uvec2 packed_s[3];
+    vec4  s[3];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+    LOAD1(packed_s[2], src, offset + uint(2));
+
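+    // Three uvec2 loads provide 12 packed halves, covering the 9 inputs
+    // needed for 4 stride-2 outputs.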
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels = vec4(0);
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+#if STRIDE_X == 2
+        vec4 s[3];
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+        vec4 s[2];
+#else               /* STRIDE_X not equal to 1 or 2 */
+#error STRIDE_X must be either 1 or 2
+#endif /* STRIDE_X == 2 */
+        vec4 r;
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[1]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = LOAD_AND_UNPACK(offset);
+
+        pixels += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    pixels += vec4(b);
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels.xy), packHalf2x16(pixels.zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4x3 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[3];
+    int  i;
+
+    for(i = 0; i < 3; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+        vec4 s[2];
+        vec4 r;
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 3; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4x4 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[4];
+    int  i;
+
+    for(i = 0; i < 4; i++)
+    {
+        pixels[i] = vec4(0);
+    }
+
+    uint z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load 3 weights once
+        uvec2 packed_w[3];
+
+        LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+        LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+        LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+        vec3 w[3];
+        w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+        w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+        w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
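+        // Four output rows are produced per invocation, so six input rows
+        // (y..y+5) are read and each interior row feeds up to three of the
+        // four accumulators.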
+        vec4 s[2];
+        vec4 r;
+        uint offset;
+        // first line
+        offset = src.current_offset >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[0]);
+
+        // second line
+        offset = (src.current_offset + src_stride_y) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[1]);
+        pixels[1] += CONVOLVE1x3(s, w[0]);
+
+        // third line
+        offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[0] += CONVOLVE1x3(s, w[2]);
+        pixels[1] += CONVOLVE1x3(s, w[1]);
+        pixels[2] += CONVOLVE1x3(s, w[0]);
+
+        // fourth line
+        offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[1] += CONVOLVE1x3(s, w[2]);
+        pixels[2] += CONVOLVE1x3(s, w[1]);
+        pixels[3] += CONVOLVE1x3(s, w[0]);
+
+        // fifth line
+        offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[2] += CONVOLVE1x3(s, w[2]);
+        pixels[3] += CONVOLVE1x3(s, w[1]);
+
+        // sixth line
+        offset = (src.current_offset + uint(5) * (src_stride_y)) >> uint(3);
+        s      = load_and_unpack(offset);
+
+        pixels[3] += CONVOLVE1x3(s, w[2]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+    LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+    if(z_index % uint(2) == uint(0))
+    {
+        b = unpackHalf2x16(packed_b).x;
+    }
+    else
+    {
+        b = unpackHalf2x16(packed_b).y;
+    }
+
+    for(i = 0; i < 4; i++)
+    {
+        pixels[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+    STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+    STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+    STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+    packed_d = uvec2(packHalf2x16(pixels[3].xy), packHalf2x16(pixels[3].zw));
+    STORE1(dst, (dst.current_offset + uint(3) * (dst_stride_y)) >> uint(3), packed_d);
+}
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
+
+vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
+{
+    vec4 middle;
+    vec4 right;
+    vec4 r;
+
+    middle = vec4(tmp[0].yzw, tmp[1].x);
+    right  = vec4(tmp[0].zw, tmp[1].xy);
+
+    r = tmp[0] * w[0] + middle * w[1] + right * w[2];
+
+    return r;
+}
+
+vec4[2] load_and_unpack(uint offset)
+{
+    uvec2 packed_s[2];
+    vec4  s[2];
+
+    LOAD1(packed_s[0], src, offset);
+    LOAD1(packed_s[1], src, offset + uint(1));
+
+    s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    return s;
+}
+
+/** An optimized direct convolution 3x3 OpenGL ES compute shader for processing 4x3x2 elements at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "#define BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT_FP16(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+#ifdef BIAS
+    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+#endif /* BIAS */
+
+    uvec2 packed_d;
+
+    vec4 pixels[3];
+    int  i;
+
+    uint z_base_index = gl_GlobalInvocationID.z << 1;
+
+    // store the original src offset
+    uint s_offset = src.current_offset;
+
+    weights.current_offset += z_base_index * weights_stride_w;
+
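+    // Each invocation computes two consecutive output feature maps (z_base_index and z_base_index + 1).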
+    for(int z = 0; z < 2; ++z)
+    {
+        uint z_index = z_base_index + uint(z);
+
+        src.current_offset = s_offset;
+        //weights.current_offset = z_index * weights_stride_w;
+
+        for(i = 0; i < 3; i++)
+        {
+            pixels[i] = vec4(0);
+        }
+
+        for(int d = 0; d < int(weights_depth); ++d)
+        {
+            // load 3 weights once
+            uvec2 packed_w[3];
+
+            LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
+            LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
+            LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+
+            vec3 w[3];
+            w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
+            w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
+            w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
+
+            vec4 s[2];
+            vec4 r;
+            uint offset;
+            // first line
+            offset = src.current_offset >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[0]);
+
+            // second line
+            offset = (src.current_offset + src_stride_y) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[1]);
+            pixels[1] += CONVOLVE1x3(s, w[0]);
+
+            // third line
+            offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[0] += CONVOLVE1x3(s, w[2]);
+            pixels[1] += CONVOLVE1x3(s, w[1]);
+            pixels[2] += CONVOLVE1x3(s, w[0]);
+
+            // fourth line
+            offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[1] += CONVOLVE1x3(s, w[2]);
+            pixels[2] += CONVOLVE1x3(s, w[1]);
+
+            // fifth line
+            offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
+            s      = load_and_unpack(offset);
+
+            pixels[2] += CONVOLVE1x3(s, w[2]);
+
+            src.current_offset += src_stride_z;
+            weights.current_offset += weights_stride_z;
+        }
+
+#ifdef BIAS
+        uint  packed_b;
+        float b;
+        LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+        if(z_index % uint(2) == uint(0))
+        {
+            b = unpackHalf2x16(packed_b).x;
+        }
+        else
+        {
+            b = unpackHalf2x16(packed_b).y;
+        }
+
+        for(i = 0; i < 3; i++)
+        {
+            pixels[i] += vec4(b);
+        }
+#endif /* BIAS */
+
+        packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
+        STORE1(dst, dst.current_offset >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
+        STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
+
+        packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
+        STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+
+        dst.current_offset += dst_stride_z;
+    }
+}
+#endif /* PROCESS_1_ELEMENT */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
new file mode 100644
index 0000000..a36bd43
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+/** This kernel performs a direct convolution to convolve the three lowest dimensions
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, float, readonly);
+#endif /* BIAS */
+
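+// Helper: loads five consecutive float elements starting at "offset" from buffer "name" into the 5-element array "r" (one 1x5 row).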
+#define LOAD20(r, name, offset)           \
+    r[0] = LOAD4(name, offset);           \
+    r[1] = LOAD4(name, offset + uint(1)); \
+    r[2] = LOAD4(name, offset + uint(2)); \
+    r[3] = LOAD4(name, offset + uint(3)); \
+    r[4] = LOAD4(name, offset + uint(4))
+
+/** This kernel performs a direct convolution to convolve the three lowest dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    float pixels  = CONVERT(0, float);
+    uint  z_index = gl_GlobalInvocationID.z;
+    weights.current_offset += z_index * weights_stride_w >> 2;
+    float temp[5];
+    float temp_weight[5];
+
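+    // For each input channel, accumulate the full 5x5 dot product row by row (five taps per row).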
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        LOAD20(temp, src, offset(src, 0, 0));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 1));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 2));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 3));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        LOAD20(temp, src, offset(src, 0, 4));
+        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
+        pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
+
+        src.current_offset += (src_stride_z >> 2);
+        weights.current_offset += (weights_stride_z >> 2);
+    }
+
+#ifdef BIAS
+    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+#endif /* BIAS */
+
+    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+#if defined(PROCESS_4X_1Y_1Z)
+
+/** This kernel performs a direct convolution to convolve the three lowest dimensions
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(biases, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
+#else /* STRIDE_X == 1 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
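+// Loads 8 consecutive packed FP16 source values (two uvec2) from the given row and unpacks them into two vec4.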
+vec4[2] load_src_stride1(Image src, int row)
+{
+    uvec2 packed[2];
+    vec4  ret[2];
+
+    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+
+    return ret;
+}
+
+vec4[3] load_src_stride2(Image src, int row)
+{
+    uvec2 packed[3];
+    vec4  ret[3];
+
+    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
+
+    return ret;
+}
+
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 packed_w;
+    vec2  ret[3];
+
+    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
+
+    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
+    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
+    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
+
+    return ret;
+}
+
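+// Computes four adjacent outputs of a horizontal 1x5 convolution; the five taps are packed into three vec2 (the last .y is unused).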
+vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
+{
+    vec4 src0 = tmp[0];
+    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
+    vec4 src4 = tmp[1];
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
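+// Stride-2 variant: the four outputs start on every other input column, so twelve unpacked source values (three vec4) are consumed.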
+vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
+{
+    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
+    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
+    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
+    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+/** This kernel performs a direct convolution to convolve the three lowest dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* BIAS */
+
+    vec4  res = vec4(0);
+    vec2  w[3];
+    vec4  s[STRIDE_X + 1];
+    uvec2 packed_d;
+    uint  z_index = gl_GlobalInvocationID.z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        for(int row = 0; row < 5; row++)
+        {
+            w = load_weight(weights, row);
+            s = LOAD_SRC(src, row);
+            res += CONVOLVE1x5(s, w);
+        }
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+
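+    // The loaded word holds two packed FP16 biases; select the low or high half according to the parity of z_index.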
+    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+    res += vec4(b);
+#endif /* BIAS */
+
+    packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
+    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+
+#elif defined(PROCESS_4X_3Y_1Z)
+
+/** An optimized direct convolution 5x5 OpenGL ES compute shader processing 3 elements along Y at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(bias, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
+#else /* STRIDE_X == 1 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
+vec4[2] load_src_stride1(Image src, int row)
+{
+    uvec2 packed[2];
+    vec4  ret[2];
+
+    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+
+    return ret;
+}
+
+vec4[3] load_src_stride2(Image src, int row)
+{
+    uvec2 packed[3];
+    vec4  ret[3];
+
+    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
+
+    return ret;
+}
+
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 packed_w;
+    vec2  ret[3];
+
+    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
+
+    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
+    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
+    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
+
+    return ret;
+}
+
+vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
+{
+    vec4 src0 = tmp[0];
+    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
+    vec4 src4 = tmp[1];
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
+{
+    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
+    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
+    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
+    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+#endif /* BIAS */
+
+    vec4  res[3];
+    vec2  w[5][3];
+    vec4  s[STRIDE_X + 1];
+    uvec2 packed_d;
+    uint  z_index = gl_GlobalInvocationID.z;
+    int   i;
+
+    for(i = 0; i < 3; i++)
+    {
+        res[i] = vec4(0);
+    }
+
+    weights.current_offset += z_index * weights_stride_w;
+
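+    // Seven source rows produce three output rows: source row r contributes to output rows max(0, r - 4) .. min(2, r).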
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        // load weights once
+        for(int row = 0; row < 5; row++)
+        {
+            w[row] = load_weight(weights, row);
+        }
+
+        // 1st line
+        s = LOAD_SRC(src, 0);
+        res[0] += CONVOLVE1x5(s, w[0]);
+
+        // 2nd line
+        s = LOAD_SRC(src, 1);
+        res[0] += CONVOLVE1x5(s, w[1]);
+        res[1] += CONVOLVE1x5(s, w[0]);
+
+        // 3rd line
+        s = LOAD_SRC(src, 2);
+        res[0] += CONVOLVE1x5(s, w[2]);
+        res[1] += CONVOLVE1x5(s, w[1]);
+        res[2] += CONVOLVE1x5(s, w[0]);
+
+        // 4th line
+        s = LOAD_SRC(src, 3);
+        res[0] += CONVOLVE1x5(s, w[3]);
+        res[1] += CONVOLVE1x5(s, w[2]);
+        res[2] += CONVOLVE1x5(s, w[1]);
+
+        // 5th line
+        s = LOAD_SRC(src, 4);
+        res[0] += CONVOLVE1x5(s, w[4]);
+        res[1] += CONVOLVE1x5(s, w[3]);
+        res[2] += CONVOLVE1x5(s, w[2]);
+
+        // 6th line
+        s = LOAD_SRC(src, 5);
+        res[1] += CONVOLVE1x5(s, w[4]);
+        res[2] += CONVOLVE1x5(s, w[3]);
+
+        // 7th line
+        s = LOAD_SRC(src, 6);
+        res[2] += CONVOLVE1x5(s, w[4]);
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+
+    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
+    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+    for(i = 0; i < 3; i++)
+    {
+        res[i] += vec4(b);
+    }
+#endif /* BIAS */
+
+    for(i = 0; i < 3; i++)
+    {
+        packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
+        GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
+    }
+}
+
+#elif defined(PROCESS_4X_3Y_2Z)
+
+/** An optimized direct convolution 5x5 OpenGL ES compute shader processing 3 elements along Y and 2 elements along Z at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1 and 2
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(bias, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
+#else /* STRIDE_X == 1 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
+vec4[2] load_src_stride1(Image src, int row)
+{
+    uvec2 packed[2];
+    vec4  ret[2];
+
+    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+
+    return ret;
+}
+
+vec4[3] load_src_stride2(Image src, int row)
+{
+    uvec2 packed[3];
+    vec4  ret[3];
+
+    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
+
+    return ret;
+}
+
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 packed_w;
+    vec2  ret[3];
+
+    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
+
+    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
+    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
+    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
+
+    return ret;
+}
+
+vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
+{
+    vec4 src0 = tmp[0];
+    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
+    vec4 src4 = tmp[1];
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
+{
+    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
+    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
+    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
+    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
+    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+#endif /* BIAS */
+
+    vec4  res[3];
+    vec2  w[5][3];
+    vec4  s[STRIDE_X + 1];
+    uvec2 packed_d;
+    uint  z_index  = (gl_GlobalInvocationID.z);
+    uint  s_offset = src.current_offset;
+    int   i, z;
+
+    weights.current_offset += z_index * weights_stride_w;
+
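+    // Two consecutive output feature maps are computed; after each depth loop the weights offset runs straight into the next filter (this assumes no padding between consecutive filters in the weights tensor).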
+    for(z = 0; z < 2; z++)
+    {
+        z_index += uint(z);
+        src.current_offset = s_offset;
+
+        for(i = 0; i < 3; i++)
+        {
+            res[i] = vec4(0);
+        }
+
+        for(int d = 0; d < int(weights_depth); ++d)
+        {
+            // load weights once
+            for(int row = 0; row < 5; row++)
+            {
+                w[row] = load_weight(weights, row);
+            }
+
+            // 1st line
+            s = LOAD_SRC(src, 0);
+            res[0] += CONVOLVE1x5(s, w[0]);
+
+            // 2nd line
+            s = LOAD_SRC(src, 1);
+            res[0] += CONVOLVE1x5(s, w[1]);
+            res[1] += CONVOLVE1x5(s, w[0]);
+
+            // 3rd line
+            s = LOAD_SRC(src, 2);
+            res[0] += CONVOLVE1x5(s, w[2]);
+            res[1] += CONVOLVE1x5(s, w[1]);
+            res[2] += CONVOLVE1x5(s, w[0]);
+
+            // 4th line
+            s = LOAD_SRC(src, 3);
+            res[0] += CONVOLVE1x5(s, w[3]);
+            res[1] += CONVOLVE1x5(s, w[2]);
+            res[2] += CONVOLVE1x5(s, w[1]);
+
+            // 5th line
+            s = LOAD_SRC(src, 4);
+            res[0] += CONVOLVE1x5(s, w[4]);
+            res[1] += CONVOLVE1x5(s, w[3]);
+            res[2] += CONVOLVE1x5(s, w[2]);
+
+            // 6th line
+            s = LOAD_SRC(src, 5);
+            res[1] += CONVOLVE1x5(s, w[4]);
+            res[2] += CONVOLVE1x5(s, w[3]);
+
+            // 7th line
+            s = LOAD_SRC(src, 6);
+            res[2] += CONVOLVE1x5(s, w[4]);
+
+            src.current_offset += src_stride_z;
+            weights.current_offset += weights_stride_z;
+        }
+
+#ifdef BIAS
+        uint  packed_b;
+        float b;
+
+        GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
+        b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+        for(i = 0; i < 3; i++)
+        {
+            res[i] += vec4(b);
+        }
+#endif /* BIAS */
+
+        for(i = 0; i < 3; i++)
+        {
+            packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
+            GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
+        }
+
+        dst.current_offset += dst_stride_z;
+    }
+}
+
+#elif defined(PROCESS_8X_1Y_1Z)
+
+/** An optimized direct convolution 5x5 OpenGL ES compute shader processing 8 elements along X at once
+ *
+ * @note This OpenGL ES shader works with stride_x = 1
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
+ * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
+ * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth                         The third dimension of the weights tensor
+ */
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+    TENSOR3D_PARAM_DECLARATION(weights);
+#ifdef BIAS
+    VECTOR_PARAM_DECLARATION(biases);
+#endif /* BIAS */
+    uint weights_stride_w;
+    uint weights_depth;
+};
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+BUFFER_DECLARATION(weights, 3, uint, readonly);
+#ifdef BIAS
+BUFFER_DECLARATION(bias, 4, uint, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 1
+#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#error stride == 2 for PROCESS_8X_1Y not implemented
+#else /* STRIDE_X == 1 */
+#error STRIDE_X larger than 2 is not supported
+#endif /* STRIDE_X == 1 */
+
+vec4[3] load_src_stride1(Image src, int row)
+{
+    uvec4 packed[2];
+    vec4  ret[3];
+
+    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
+
+    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
+    ret[1] = vec4(unpackHalf2x16(packed[0].z), unpackHalf2x16(packed[0].w));
+    ret[2] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
+
+    return ret;
+}
+
+vec2[3] load_weight(Tensor3D weights, int row)
+{
+    uvec3 packed_w;
+    vec2  ret[3];
+
+    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
+
+    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
+    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
+    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
+
+    return ret;
+}
+
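+// Computes eight adjacent outputs of a horizontal 1x5 convolution as two vec4, sliding over the twelve unpacked source values in tmp.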
+vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
+{
+    vec4 src0 = tmp[0];
+    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
+    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
+    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
+    vec4 src4 = tmp[1];
+    vec4 ret[2];
+
+    ret[0] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    src0   = tmp[1];
+    src1   = vec4(tmp[1].yzw, tmp[2].x);
+    src2   = vec4(tmp[1].zw, tmp[2].xy);
+    src3   = vec4(tmp[1].w, tmp[2].xyz);
+    src4   = tmp[2];
+    ret[1] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
+
+    return ret;
+}
+
+void main()
+{
+    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef BIAS
+    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+#endif /* BIAS */
+
+    vec4  res[2];
+    vec2  w[3];
+    vec4  s[STRIDE_X + 2];
+    uvec4 packed_d;
+    uint  z_index = gl_GlobalInvocationID.z;
+
+    res[0] = vec4(0);
+    res[1] = vec4(0);
+    weights.current_offset += z_index * weights_stride_w;
+
+    for(int d = 0; d < int(weights_depth); ++d)
+    {
+        for(int row = 0; row < 5; row++)
+        {
+            w = load_weight(weights, row);
+            s = LOAD_SRC(src, row);
+            res[0] += CONVOLVE1x5(s, w)[0];
+            res[1] += CONVOLVE1x5(s, w)[1];
+        }
+
+        src.current_offset += src_stride_z;
+        weights.current_offset += weights_stride_z;
+    }
+
+#ifdef BIAS
+    uint  packed_b;
+    float b;
+
+    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
+    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+    res[0] += vec4(b);
+    res[1] += vec4(b);
+#endif /* BIAS */
+
+    packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
+    packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
+    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+}
+
+#else /* defined(PROCESS_4X_1Y_1Z) */
+
+#endif /* defined(PROCESS_4X_1Y_1Z) */
+
+#else /* DATA_TYPE_FP16 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
new file mode 100644
index 0000000..54e08b1
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(mask);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
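+// Lightweight integer hash used to derive a per-element pseudo-random float in [0, 1) from the global invocation ID and a seed.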
+uint hash(uint x)
+{
+    x += (x << 10u);
+    x ^= (x >> 6u);
+    x += (x << 3u);
+    x ^= (x >> 11u);
+    x += (x << 15u);
+    return x;
+}
+
+uint hash(uvec3 v)
+{
+    return hash(v.x ^ hash(v.y) ^ hash(v.z));
+}
+
+float float_construct(uint m)
+{
+    const uint ieee_mantissa = 0x007FFFFFu;
+    const uint ieee_one      = 0x3F800000u;
+
+    m &= ieee_mantissa;
+    m |= ieee_one;
+
+    float f = uintBitsToFloat(m);
+    return f - 1.0;
+}
+
+float rand(vec3 v, float seed)
+{
+    return float_construct(hash(floatBitsToUint(v + seed)));
+}
+
+#ifdef DATA_TYPE_FP32
+
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(mask, 2, float, );
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] mask_ptr                           Pointer to the mask tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mask_stride_x                      Stride of the mask tensor in X dimension (in bytes)
+ * @param[in]  mask_step_x                        mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mask_stride_y                      Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in]  mask_step_y                        mask_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mask_stride_z                      Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in]  mask_step_z                        mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float random  = 0.f;
+    float inputv  = 0.f;
+    float maskv   = 0.f;
+    float outputv = 0.f;
+
+#ifdef FORWARD
+    random = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    maskv  = (random > RATIO) ? 1.f : 0.f;
+    GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#else  /* FORWARD */
+    GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#endif /* FORWARD */
+
+    GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
+    outputv = maskv * inputv * float(SCALE);
+    GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(mask, 2, uint, );
+BUFFER_DECLARATION(dst, 3, uint, writeonly);
+
+/** Dropout is used to reduce over-fitting in neural networks.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F16
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] mask_ptr                           Pointer to the mask tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mask_stride_x                      Stride of the mask tensor in X dimension (in bytes)
+ * @param[in]  mask_step_x                        mask_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mask_stride_y                      Stride of the mask tensor in Y dimension (in bytes)
+ * @param[in]  mask_step_y                        mask_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mask_stride_z                      Stride of the mask tensor in Z dimension (in bytes)
+ * @param[in]  mask_step_z                        mask_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src  = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
+    Tensor3D dst  = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float random1    = 0.f;
+    float random2    = 0.f;
+    uint  inputv     = uint(0);
+    uint  outputv    = uint(0);
+    uint  maskv      = uint(0);
+    vec2  input_vec  = vec2(0, 0);
+    vec2  output_vec = vec2(0, 0);
+    vec2  mask_vec   = vec2(0, 0);
+
+#ifdef FORWARD
+    random1          = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
+    random2          = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED);
+    mask_vec.x       = (random1 > RATIO) ? 1.f : 0.f;
+    mask_vec.y       = (random2 > RATIO) ? 1.f : 0.f;
+    maskv            = packHalf2x16(mask_vec);
+    GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+#else  /* FORWARD */
+    GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
+    mask_vec = unpackHalf2x16(maskv);
+#endif /* FORWARD */
+
+    GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
+
+    input_vec  = unpackHalf2x16(inputv);
+    output_vec = mask_vec * input_vec * float(SCALE);
+    outputv    = packHalf2x16(output_vec);
+
+    GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+}
+
+#else /* DATA_TYPE_FP32 */
+
+#endif /* DATA_TYPE_FP32 */
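During the forward pass each element survives with probability 1 - RATIO and surviving values are rescaled by SCALE, which for inverted dropout is conventionally 1 / (1 - RATIO) (an assumption here; the shader simply multiplies by whatever SCALE it is compiled with). A hypothetical C++ reference of the element-wise rule, with std::mt19937 standing in for the shader's hash-based rand():

    #include <cstddef>
    #include <random>
    #include <vector>

    // Hypothetical reference for the FORWARD path: build the mask and the scaled
    // output for a flat buffer. mask and dst must already be the same size as src.
    void dropout_forward(const std::vector<float> &src, std::vector<float> &mask,
                         std::vector<float> &dst, float ratio, unsigned seed)
    {
        const float scale = 1.0f / (1.0f - ratio); // inverted-dropout assumption for SCALE
        std::mt19937 rng(seed);
        std::uniform_real_distribution<float> uni(0.0f, 1.0f);
        for(std::size_t i = 0; i < src.size(); ++i)
        {
            mask[i] = (uni(rng) > ratio) ? 1.0f : 0.0f; // keep with probability 1 - ratio
            dst[i]  = mask[i] * src[i] * scale;
        }
    }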
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
new file mode 100644
index 0000000..c64572b
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+#ifdef FILL_IMAGE_BORDERS_REPLICATE
+
+/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @attention  The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr     Pointer to the source image. Supported data types: F16/F32
+ * @param[in]     buf_attrs   The attributes of the source image
+ * @param[in]     width       Width of the valid region of the image
+ * @param[in]     height      Height of the valid region of the image
+ * @param[in]     start_pos_x X coordinate indicating the start point of the valid region
+ * @param[in]     start_pos_y Y coordinate indicating the start point of the valid region
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes buf_attrs;
+    uint               width;
+    uint               height;
+    int                start_pos_x;
+    int                start_pos_y;
+};
+
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, bufBuffer, float, buf_ptr, buf_shift, 2, restrict);
+
+void main()
+{
+    ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
+
+    // Update pointer to point to the starting point of the valid region
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        float left_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, 0, gidH));
+        for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, -(i + 1), gidH), left_val);
+        }
+        // Handle right border
+        float right_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) + i, gidH), right_val);
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (int(width) - 1))
+        {
+            val_idx = gidW < 0 ? 0 : int(width) - 1;
+        }
+
+        // Handle top border
+        float top_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, 0));
+        for(int i = 0; i < BORDER_SIZE_TOP; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, -(i + 1)), top_val);
+        }
+        // Handle bottom border
+        float bottom_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, int(height) - 1));
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, int(height) + i), bottom_val);
+        }
+    }
+}
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, bufBuffer, uint, buf_ptr, buf_shift, 2, restrict);
+
+void set_replicate(uint offset, int pos, vec2 replicate_value)
+{
+    vec2 b = LOAD_UNPACK2_HALF(buf_ptr, offset);
+
+    if(pos % 2 == 0)
+    {
+        b.x = replicate_value.y;
+    }
+    else
+    {
+        b.y = replicate_value.x;
+    }
+
+    STORE_PACK2_HALF(buf_ptr, offset, b);
+}
+
+void main()
+{
+    ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
+
+    // Update pointer to point to the starting point of the valid region
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, uint(start_pos_y) * buf_attrs.stride_y + uint(start_pos_x) * buf_attrs.stride_x);
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        vec2 left_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, 0, gidH));
+        for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, -(i + 1), gidH);
+            int  pos    = BORDER_SIZE_LEFT - i - 1;
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    set_replicate(offset, pos, left_val);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, left_val.xx);
+                }
+            }
+        }
+        // Handle right border
+        vec2 right_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, int(width) + i, gidH);
+            int  pos    = i + BORDER_SIZE_LEFT + int(width);
+
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, right_val.yy);
+                }
+                else
+                {
+                    set_replicate(offset, pos, right_val);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, right_val.yy);
+                }
+            }
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || (gidW > (int(width) - 1)))
+        {
+            val_idx = gidW < 0 ? 0 : (int(width) - 1);
+        }
+
+        // Handle top border
+        vec2 top_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, 0));
+        for(int i = 0; i < BORDER_SIZE_TOP; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, gidW, -(i + 1));
+
+            if(gid0 % 2 == 0)
+            {
+                if(gidW == (int(width) - 1))
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
+                }
+                else
+                {
+                    if(gidW < 0)
+                    {
+                        if(BORDER_SIZE_LEFT % 2 == 0)
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
+                        }
+                        else
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
+                        }
+                    }
+                    else if(gidW >= int(width))
+                    {
+                        if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
+                        }
+                    }
+                    else
+                    {
+                        STORE_PACK2_HALF(buf_ptr, offset, top_val);
+                    }
+                }
+            }
+        }
+        // Handle bottom border
+        vec2 bottom_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, int(height) - 1));
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, gidW, int(height) + i);
+
+            if(gid0 % 2 == 0)
+            {
+                if(gidW == (int(width) - 1))
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
+                }
+                else
+                {
+                    if(gidW < 0)
+                    {
+                        if(BORDER_SIZE_LEFT % 2 == 0)
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
+                        }
+                        else
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, bottom_val.yy);
+                        }
+                    }
+                    else if(gidW >= int(width))
+                    {
+                        if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
+                        {
+                            STORE_PACK2_HALF(buf_ptr, offset, bottom_val.yy);
+                        }
+                    }
+                    else
+                    {
+                        STORE_PACK2_HALF(buf_ptr, offset, bottom_val);
+                    }
+                }
+            }
+        }
+    }
+}
+
+#endif /* DATA_TYPE_FP32 */
+
+#endif /* FILL_IMAGE_BORDERS_REPLICATE */
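Replicate filling is index clamping: every padding element copies the nearest element of the valid region, so corners repeat the corner pixel and edges repeat the edge row or column. A compact C++ sketch over a padded row-major image (hypothetical flat layout, not the library's Tensor/Window machinery):

    #include <algorithm>
    #include <vector>

    // img is (top + height + bottom) x (left + width + right), row-major; the valid
    // region starts at (top, left). Fill the padding by clamping into the valid region.
    void fill_border_replicate(std::vector<float> &img, int width, int height,
                               int left, int right, int top, int bottom)
    {
        const int total_w = left + width + right;
        const int total_h = top + height + bottom;
        for(int y = 0; y < total_h; ++y)
        {
            const int sy = std::clamp(y - top, 0, height - 1);
            for(int x = 0; x < total_w; ++x)
            {
                if(y >= top && y < top + height && x >= left && x < left + width)
                {
                    continue; // inside the valid region, nothing to fill
                }
                const int sx         = std::clamp(x - left, 0, width - 1);
                img[y * total_w + x] = img[(sy + top) * total_w + (sx + left)];
            }
        }
    }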
+
+#ifdef FILL_IMAGE_BORDERS_CONSTANT
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @attention  The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr        Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  buf_attrs      The attributes of the source image
+ * @param[in]  width          Width of the valid region of the image
+ * @param[in]  height         Height of the valid region of the image
+ * @param[in]  start_pos_x    X coordinate indicating the start point of the valid region
+ * @param[in]  start_pos_y    Y coordinate indicating the start point of the valid region
+ * @param[in]  constant_value Constant value to use to fill the edges
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes buf_attrs;
+    uint               width;
+    uint               height;
+    int                start_pos_x;
+    int                start_pos_y;
+    float              constant_value;
+};
+
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, bufBuffer, float, buf_ptr, buf_shift, 2, writeonly);
+
+void main()
+{
+    ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
+
+    // Update pointer to point to the starting point of the valid region
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, -(i + 1), gidH), constant_value);
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) + i, gidH), constant_value);
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = 0; i < BORDER_SIZE_TOP; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, -(i + 1)), constant_value);
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, int(height) + i), constant_value);
+        }
+    }
+}
+
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, bufBuffer, uint, buf_ptr, buf_shift, 2, restrict);
+
+void set_constant(uint offset, int pos)
+{
+    vec2 b = LOAD_UNPACK2_HALF(buf_ptr, offset);
+
+    if(pos % 2 == 0)
+    {
+        b.x = constant_value;
+    }
+    else
+    {
+        b.y = constant_value;
+    }
+
+    STORE_PACK2_HALF(buf_ptr, offset, b);
+}
+
+void main()
+{
+    ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
+
+    int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
+    int gid0        = int(gl_GlobalInvocationID.x);
+    int gidH        = gid0 - total_width;
+    int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    // Update pointer to point to the starting point of the valid region
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
+
+    vec2 b = vec2(constant_value, constant_value);
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, -(i + 1), gidH);
+            int  pos    = BORDER_SIZE_LEFT - i - 1;
+
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    set_constant(offset, pos);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, b);
+                }
+            }
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, int(width) + i, gidH);
+            int  pos    = i + BORDER_SIZE_LEFT + int(width);
+
+            if(i == 0)
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, b);
+                }
+                else
+                {
+                    set_constant(offset, pos);
+                }
+            }
+            else
+            {
+                if(pos % 2 == 0)
+                {
+                    STORE_PACK2_HALF(buf_ptr, offset, b);
+                }
+            }
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = 0; i < BORDER_SIZE_TOP; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, gidW, -(i + 1));
+
+            if(gid0 % 2 == 0)
+            {
+                STORE_PACK2_HALF(buf_ptr, offset, b);
+            }
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            uint offset = IMAGE_OFFSET(buf_iter, gidW, int(height) + i);
+
+            if(gid0 % 2 == 0)
+            {
+                STORE_PACK2_HALF(buf_ptr, offset, b);
+            }
+        }
+    }
+}
+
+#endif /* DATA_TYPE_FP32 */
+
+#endif /* FILL_IMAGE_BORDERS_CONSTANT */
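The FP16 paths above are complicated by the fact that two half values share one 32-bit word: when a border element sits in the same word as a still-valid element, the word has to be read, have a single lane replaced, and be written back (set_replicate / set_constant). A C++ sketch of that lane update, with uint16_t lanes standing in for the packed halves (packHalf2x16 places .x in the low 16 bits):

    #include <cstdint>

    // One 32-bit word holds two 16-bit lanes; element x lives in lane (x % 2).
    // Overwrite only the lane belonging to the border element and keep the other.
    uint32_t store_lane(uint32_t packed, int element_x, uint16_t value)
    {
        if(element_x % 2 == 0)
        {
            packed = (packed & 0xFFFF0000u) | uint32_t(value);         // low lane (.x)
        }
        else
        {
            packed = (packed & 0x0000FFFFu) | (uint32_t(value) << 16); // high lane (.y)
        }
        return packed;
    }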
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
new file mode 100644
index 0000000..8cf95af
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+#define LOAD8(r, name, offset) \
+    r.x = LOAD4(name, offset); \
+    r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2)); \
+    r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r)         \
+    STORE4(name, offset, r.x);           \
+    STORE4(name, offset + uint(1), r.y); \
+    STORE4(name, offset + uint(2), r.z); \
+    STORE4(name, offset + uint(3), r.w)
+
+#ifdef GEMM_TRANSPOSE1xW
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the "vector" 1x4 transposition of the input matrix
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute address for Matrix B - source */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+    uint dst_addr_in_bytes = (gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst.stride_y + dst.offset_first_element_in_bytes) >> 2;
+    vec4 b0;
+    LOAD16(b0, src, offset(src, 0, 0));
+    STORE16(dst, dst_addr_in_bytes, b0);
+}
+#endif /* GEMM_TRANSPOSE1xW */
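In the kernel above each invocation reads one 1x4 block from source row y and writes it as a contiguous 4-float group into output row b (the block's column index), at column offset 4*y. A whole-matrix C++ sketch of the same reshape (hypothetical flat row-major buffers, width assumed to be a multiple of 4):

    #include <cstddef>
    #include <vector>

    // Transpose1x4 reshape: src is h rows x w cols (w a multiple of 4).
    // Output block row b, columns [4*y, 4*y+3]  <-  source row y, columns [4*b, 4*b+3].
    std::vector<float> gemm_transpose1x4(const std::vector<float> &src, int w, int h)
    {
        const int out_w = 4 * h;
        const int out_h = w / 4;
        std::vector<float> dst(std::size_t(out_w) * out_h);
        for(int y = 0; y < h; ++y)
        {
            for(int b = 0; b < out_h; ++b)
            {
                for(int k = 0; k < 4; ++k)
                {
                    dst[std::size_t(b) * out_w + 4 * y + k] = src[std::size_t(y) * w + 4 * b + k];
                }
            }
        }
        return dst;
    }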
+
+#ifdef GEMM_INTERLEAVE4x4
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel reshapes the input matrix by interleaving its values
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int i;
+    int j;
+
+    for(i = 0; i < 4; ++i)
+    {
+        for(j = 0; j < 4; ++j)
+        {
+            float res     = LOAD4(src, offset(src, i, j));
+            uint  offset0 = CURRENT_OFFSET(dst) + uint(i * 4 + j);
+            STORE4(dst, offset0, res);
+        }
+    }
+}
+#endif /* GEMM_INTERLEAVE4x4 */
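The interleave walks a 4x4 block column by column, emitting the element of each of the four rows before moving to the next column, so a group of four source rows collapses into one output row of width 4*W. A whole-matrix C++ sketch (hypothetical flat row-major buffers, height assumed to be a multiple of 4):

    #include <cstddef>
    #include <vector>

    // Interleave4x4: rows are processed in groups of 4; the output row for a group holds
    // [r0[0], r1[0], r2[0], r3[0], r0[1], r1[1], r2[1], r3[1], ...].
    std::vector<float> gemm_interleave4x4(const std::vector<float> &src, int w, int h)
    {
        std::vector<float> dst(std::size_t(w) * h);
        const int out_w = 4 * w;
        for(int g = 0; g < h / 4; ++g) // one group of 4 source rows -> one output row
        {
            for(int x = 0; x < w; ++x)
            {
                for(int r = 0; r < 4; ++r)
                {
                    dst[std::size_t(g) * out_w + 4 * x + r] = src[std::size_t(4 * g + r) * w + x];
                }
            }
        }
        return dst;
    }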
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, float, restrict);
+BUFFER_DECLARATION(biases, 2, float, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data types: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    for(int i = 0; i < 16; ++i)
+    {
+        float accum_value  = LOAD4(accum, CURRENT_OFFSET(accum) + uint(i));
+        float biases_value = LOAD4(biases, CURRENT_OFFSET(biases) + uint(i));
+        accum_value        = biases_value + accum_value;
+
+        // Store result in the accumulate buffer
+        STORE4(accum, CURRENT_OFFSET(accum) + uint(i), accum_value);
+    }
+}
+#endif /* GEMM_ACCUMULATE_BIASES */
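Bias accumulation is a plain row-wise add: the bias vector is added element by element to every row of the accumulator. A one-function C++ equivalent over a hypothetical row-major layout:

    #include <cstddef>
    #include <vector>

    // accum is h rows x w cols, row-major; biases has w entries.
    void accumulate_biases(std::vector<float> &accum, const std::vector<float> &biases,
                           int w, int h)
    {
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                accum[std::size_t(y) * w + x] += biases[std::size_t(x)];
            }
        }
    }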
+
+#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* unvalidated */
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using COLS_B and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Compute address for matrix A and B */
+    src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2);
+    src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2);
+
+    /* Compute end row address for matrix B */
+    int end_row_mtx_b = int(src1.current_offset) + int(COLS_B);
+
+    /* Reset accumulators */
+    vec4 c00 = vec4(0.0f);
+    vec4 c10 = vec4(0.0f);
+    vec4 c20 = vec4(0.0f);
+    vec4 c30 = vec4(0.0f);
+
+    for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        vec4 a0;
+        vec4 b0;
+        LOAD16(a0, src0, src0.current_offset);
+        LOAD16(b0, src1, src1.current_offset);
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        LOAD16(a0, src0, src0.current_offset + uint(4));
+        LOAD16(b0, src1, src1.current_offset + uint(4));
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+    }
+
+    for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4))
+    {
+        /* Load values from matrix A (interleaved) and matrix B (transposed) */
+        vec4 a0;
+        vec4 b0;
+        LOAD16(a0, src0, src0.current_offset);
+        LOAD16(b0, src1, src1.current_offset);
+
+        c00 += vec4(a0.x) * b0;
+        c10 += vec4(a0.y) * b0;
+        c20 += vec4(a0.z) * b0;
+        c30 += vec4(a0.w) * b0;
+    }
+
+    /* Multiply by the weight of matrix product */
+    c00 = c00 * vec4(ALPHA);
+    c10 = c10 * vec4(ALPHA);
+    c20 = c20 * vec4(ALPHA);
+    c30 = c30 * vec4(ALPHA);
+
+    /* Store 4x4 block */
+    STORE16(dst, offset(dst, 0, 0), c00);
+    STORE16(dst, offset(dst, 0, 1), c10);
+    STORE16(dst, offset(dst, 0, 2), c20);
+    STORE16(dst, offset(dst, 0, 3), c30);
+}
+#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
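With A interleaved and B transposed as above, each step of the inner loop is a rank-1 update: four scalars of A (one per output row) multiply a 4-wide vector of B (one per output column). A plain C++ reference of one 4x4 output block, useful as a check against the shader (hypothetical pointers into the reshaped, row-major buffers; k_steps plays the role of the shared dimension driven by COLS_B):

    // a_int points at the interleaved A group for block row 'by' (4 values per k step),
    // b_tr at the transposed B row for block column 'bx' (4 values per k step).
    // c is the top-left of the 4x4 output block, with row stride ldc.
    void gemm_block_4x4(const float *a_int, const float *b_tr, float *c, int ldc,
                        int k_steps, float alpha)
    {
        float acc[4][4] = {};
        for(int k = 0; k < k_steps; ++k)
        {
            const float *a = a_int + 4 * k; // a[r] = A(4*by + r, k)
            const float *b = b_tr + 4 * k;  // b[j] = B(k, 4*bx + j)
            for(int r = 0; r < 4; ++r)
            {
                for(int j = 0; j < 4; ++j)
                {
                    acc[r][j] += a[r] * b[j]; // rank-1 update, as in c00..c30 above
                }
            }
        }
        for(int r = 0; r < 4; ++r)
        {
            for(int j = 0; j < 4; ++j)
            {
                c[r * ldc + j] = alpha * acc[r][j];
            }
        }
    }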
+
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The number of columns of matrix A and the alpha's value need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2);
+    src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2);
+
+    /* Compute end row address for matrix A */
+    int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2))
+    {
+        vec2 a0;
+        LOAD8(a0, src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+        LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+        LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+        LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        vec4 b0;
+        vec4 b1;
+        LOAD16(b0, src1, src1.current_offset);
+        LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2)));
+
+        acc0 += b0 * vec4(a0.x);
+        acc0 += b1 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.x);
+        acc1 += b1 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.x);
+        acc2 += b1 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.x);
+        acc3 += b1 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2))
+    {
+        // Load values from matrix A
+        float a0;
+        a0 = LOAD4(src0, src0.current_offset);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        float a1;
+        a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        float a2;
+        a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        float a3;
+        a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        vec4 b0;
+        LOAD16(b0, src1, src1.current_offset);
+
+        acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc0 = acc0 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 0), acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    acc1 = acc1 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    acc2 = acc2 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    acc3 = acc3 * vec4(ALPHA);
+    STORE16(dst, offset(dst, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif /* GEMM_MM_FLOATING_POINT */
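Without reshaping, each invocation of this kernel produces a 4-wide strip of up to NUM_ELEMS_PROCESSED_PER_THREAD_Y output rows, streaming matrix A row-wise and matrix B in 4-column blocks. A scalar C++ reference of one such tile (hypothetical row-major A and B; ty is the number of rows per tile):

    // Compute a tile of C = alpha * A * B: rows [row0, row0 + ty) and the 4 columns
    // starting at col0. A is m x k and B is k x n, both row-major.
    void gemm_tile(const float *A, const float *B, float *C, int k, int n,
                   int row0, int col0, int ty, float alpha)
    {
        for(int r = 0; r < ty; ++r)
        {
            float acc[4] = { 0.f, 0.f, 0.f, 0.f };
            for(int kk = 0; kk < k; ++kk)
            {
                const float a = A[(row0 + r) * k + kk];
                for(int c = 0; c < 4; ++c)
                {
                    acc[c] += a * B[kk * n + col0 + c];
                }
            }
            for(int c = 0; c < 4; ++c)
            {
                C[(row0 + r) * n + col0 + c] = alpha * acc[c];
            }
        }
    }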
+
+#ifdef GEMM_MATRIXADDITION
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, restrict);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta.
+ *
+ * @attention The beta's value needs to be passed at compile time using BETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load values from A x B */
+    vec4 alpha_ab;
+    vec4 c;
+    vec4 out1;
+
+    LOAD16(alpha_ab, dst, dst.current_offset);
+    LOAD16(c, src, src.current_offset);
+
+    /* Computes alpha * axb + beta * c */
+    out1 = alpha_ab + vec4(BETA * c);
+
+    /* Store final result in axb matrix */
+    STORE16(dst, dst.current_offset, out1);
+}
+#endif /* GEMM_MATRIXADDITION */
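The addition kernel finalises the usual GEMM update: the buffer that already holds alpha * A*B gets beta * C added in place. A minimal C++ reference over flat buffers of equal size:

    #include <cstddef>
    #include <vector>

    // dst already contains alpha * (A * B); add beta * c element-wise, in place.
    void gemm_matrix_addition(std::vector<float> &dst, const std::vector<float> &c, float beta)
    {
        for(std::size_t i = 0; i < dst.size(); ++i)
        {
            dst[i] += beta * c[i];
        }
    }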
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#ifdef GEMM_MM_FLOATING_POINT
+#if defined(MM_PROCESS_4X)
+BUFFER_DECLARATION(src0, 1, uint, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The number of columns of matrix A and the alpha's value need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
+    {
+        uint packed_a;
+        vec2 a0;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0 = vec2(unpackHalf2x16(packed_a));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3 = vec2(unpackHalf2x16(packed_a));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b0;
+        uvec2 packed_b1;
+        vec4  b0;
+        vec4  b1;
+
+        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+        GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);
+
+        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+        b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));
+
+        acc0 += b0 * vec4(a0.x);
+        acc0 += b1 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * vec4(a1.x);
+        acc1 += b1 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * vec4(a2.x);
+        acc2 += b1 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * vec4(a3.x);
+        acc3 += b1 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)
+    {
+        uint packed_a0;
+        vec2 a0;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
+        a0 = vec2(unpackHalf2x16(packed_a0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec2 a1;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 1);
+        a1 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec2 a2;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 2);
+        a2 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec2 a3;
+
+        GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 3);
+        a3 = vec2(unpackHalf2x16(packed_a0));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b0;
+        vec4  b0;
+
+        GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
+
+        b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+
+        acc0 += b0 * (a0.x);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 += b0 * (a1.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 += b0 * (a2.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 += b0 * (a3.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    /* Multiply by the weight of vector-matrix product */
+    acc0 = acc0 * vec4(ALPHA);
+
+    uvec2 packed_d;
+    packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+#endif                                 // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* PROCESS_4X */
+BUFFER_DECLARATION(src0, 1, uvec4, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The number of columns of matrix A and the alpha's value need to be passed at compile time using COLS_A and ALPHA
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc0 = vec4(0.0f);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a0[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec4 a1[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec4 a2[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 a3[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b;
+        vec4  b;
+
+        for(int i = 0; i < 8; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+
+            acc0 += b * vec4(a0[j][k]);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+            acc1 += b * vec4(a1[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+            acc2 += b * vec4(a2[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+            acc3 += b * vec4(a3[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        }
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a0[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        vec4 a1[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
+        a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        vec4 a2[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
+        a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        vec4 a3[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
+        a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        uvec2 packed_b;
+        vec4  b;
+
+        int leftover = COLS_A % 8;
+
+        for(int i = 0; i < leftover; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+
+            acc0 += b * vec4(a0[j][k]);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+            acc1 += b * vec4(a1[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+            acc2 += b * vec4(a2[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+            acc3 += b * vec4(a3[j][k]);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        }
+    }
+
+    /* Multiply the result of the matrix product by ALPHA */
+    acc0 = acc0 * vec4(ALPHA);
+
+    uvec2 packed_d;
+    packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+#endif                       // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#elif defined(MM_PROCESS_8X) /* PROCESS_4X */
+BUFFER_DECLARATION(src0, 1, uvec4, readonly);
+BUFFER_DECLARATION(src1, 2, uvec4, readonly);
+BUFFER_DECLARATION(dst, 3, uvec4, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ *  in case both matrices have not been reshaped
+ *
+ * @attention The width of matrix A, the alpha value and the number of elements processed per thread along X and Y need to be passed at compile time using COLS_A, ALPHA, NUM_ELEMS_PROCESSED_PER_THREAD_X and NUM_ELEMS_PROCESSED_PER_THREAD_Y
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ */
+void main()
+{
+    Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
+    Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
+    Image dst  = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+    /* Compute the address for the vector A and matrix B */
+    src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+    src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+
+    /* Compute end row address for matrix A */
+    uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+
+    /* Reset accumulators */
+    vec4 acc[2];
+
+    acc[0] = vec4(0.0f);
+    acc[1] = vec4(0.0f);
+
+    for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+        uvec4 packed_b;
+        vec4  b[2];
+
+        for(int i = 0; i < 8; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+            b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+
+            acc[0] += b[0] * vec4(a[j][k]);
+            acc[1] += b[1] * vec4(a[j][k]);
+        }
+    }
+
+    for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+    {
+        uvec4 packed_a;
+        vec4  a[2];
+
+        GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
+        a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
+        a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+
+        uvec4 packed_b;
+        vec4  b[2];
+
+        int leftover = COLS_A % 8;
+
+        for(int i = 0; i < leftover; i++)
+        {
+            int j = i >> 2;
+            int k = i % 4;
+
+            GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
+
+            b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+            b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+
+            acc[0] += b[0] * vec4(a[j][k]);
+            acc[1] += b[1] * vec4(a[j][k]);
+        }
+    }
+
+    /* Multiply the result of the matrix product by ALPHA */
+    acc[0] = acc[0] * vec4(ALPHA);
+    acc[1] = acc[1] * vec4(ALPHA);
+
+    uvec4 packed_d;
+    packed_d = uvec4(packHalf2x16(acc[0].xy), packHalf2x16(acc[0].zw), packHalf2x16(acc[1].xy), packHalf2x16(acc[1].zw));
+    GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+}
+#endif                       /* PROCESS_4X */
+#endif                       /* GEMM_MM_FLOATING_POINT */
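All of the FP16 kernels above lean on the same packing idiom: a single SSBO load of a uvec2 or uvec4 carries four or eight packed half-precision values, unpackHalf2x16 expands each 32-bit word into two floats, and packHalf2x16 packs the results back before the store. Below is a minimal, self-contained sketch of that round trip as a standalone GLSL ES compute shader; it is not part of the patch, and the buffer names and the ALPHA constant are purely illustrative.

    #version 310 es
    layout(local_size_x = 1) in;

    // Eight F16 values travel in one uvec4, much like packed_a in the kernels above.
    layout(std430, binding = 0) readonly buffer SrcBuffer { uvec4 src_ptr[]; };
    layout(std430, binding = 1) writeonly buffer DstBuffer { uvec4 dst_ptr[]; };

    const float ALPHA = 2.0; // illustrative; the library passes ALPHA at compile time

    void main()
    {
        uint  idx      = gl_GlobalInvocationID.x;
        uvec4 packed_a = src_ptr[idx];

        // Unpack to two vec4 (cf. a0[0] / a0[1] in the GEMM kernels)
        vec4 lo = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
        vec4 hi = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));

        lo *= ALPHA;
        hi *= ALPHA;

        // Repack and store as a single uvec4 (cf. packed_d before GC_STORE1_2D_OFFSET)
        dst_ptr[idx] = uvec4(packHalf2x16(lo.xy), packHalf2x16(lo.zw),
                             packHalf2x16(hi.xy), packHalf2x16(hi.zw));
    }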
+
+#ifdef GEMM_ACCUMULATE_BIASES
+#if defined(ACCUM_PROCESS_4X)
+BUFFER_DECLARATION(accum, 1, uvec2, restrict);
+BUFFER_DECLARATION(biases, 2, uvec2, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data type: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+
+    vec4  u[2];
+    uvec2 packed_s[2];
+    GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
+    GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
+    u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+
+    vec4 tmp;
+    tmp         = u[0] + u[1];
+    packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw));
+    GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+}
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+BUFFER_DECLARATION(accum, 1, uvec4, restrict);
+BUFFER_DECLARATION(biases, 2, uvec4, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in]      accum_stride_x                       Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      accum_stride_y                       Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in]      accum_step_y                         accum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
+ * @param[in]      biases_ptr                           Pointer to the biases vector. Supported data type: same as @p accum_ptr
+ * @param[in]      biases_stride_x                      Stride of the biases vector in X dimension (in bytes)
+ * @param[in]      biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the biases vector
+ */
+void main(void)
+{
+    Image  accum  = GC_CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+
+    vec4  u[2];
+    vec4  v[2];
+    uvec4 packed_s[2];
+    GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
+    GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
+
+    u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
+
+    v[0] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    v[1] = vec4(unpackHalf2x16(packed_s[1].z), unpackHalf2x16(packed_s[1].w));
+
+    vec4 r[2];
+    r[0]        = u[0] + v[0];
+    r[1]        = u[1] + v[1];
+    packed_s[0] = uvec4(packHalf2x16(r[0].xy), packHalf2x16(r[0].zw), packHalf2x16(r[1].xy), packHalf2x16(r[1].zw));
+    GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+}
+#endif                          /* ACCUM_PROCESS_4X */
+#endif                          /* GEMM_ACCUMULATE_BIASES */
+#else                           /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
new file mode 100644
index 0000000..62c58d5
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_H
+#define ARM_COMPUTE_HELPER_H
+
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+
+#define VEC_DATA_TYPE_STR(type, size) type##size
+#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+
+#define CONVERT(x, type) type(x)
+
+#define PACK(value, stype, dtype) \
+    pack_##stype##_##dtype(value)
+
+#define UNPACK(value, stype, dtype) \
+    unpack_##stype##_##dtype(value)
+
+#define BUFFER_DECLARATION(name, location, type, access)          \
+    layout(std430, binding = location) access buffer name##Buffer \
+    {                                                             \
+        type name##_ptr[];                                        \
+    }
+
+#define VECTOR_PARAM_DECLARATION(name)         \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size
+
+#define IMAGE_PARAM_DECLARATION(name)          \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_stride_y;                      \
+    uint name##_step_y;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size;         \
+    uint name##_padding1;                      \
+    uint name##_padding2
+
+#define TENSOR3D_PARAM_DECLARATION(name)       \
+    uint name##_stride_x;                      \
+    uint name##_step_x;                        \
+    uint name##_stride_y;                      \
+    uint name##_step_y;                        \
+    uint name##_stride_z;                      \
+    uint name##_step_z;                        \
+    uint name##_offset_first_element_in_bytes; \
+    uint name##_buffer_data_type_size
+
+/** Structure to hold Vector information */
+struct Vector
+{
+    uint current_offset;                /**< Current offset of vector */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+};
+
+/** Structure to hold Image information */
+struct Image
+{
+    uint current_offset;                /**< Current offset of image */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    uint stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+};
+
+/** Structure to hold 3D tensor information */
+struct Tensor3D
+{
+    uint current_offset;                /**< Current offset of tensor */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    uint stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    uint stride_z;                      /**< Stride of the image in Z dimension (in bytes) */
+};
+
+/////////////////////////////////////////////////////////////
+
+#define CONVERT_TO_VECTOR_STRUCT(name) \
+    update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_FP16(name) \
+    update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(name) \
+    update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define CONVERT_TO_IMAGE_STRUCT(name) \
+    update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_FP16(name) \
+    update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define CONVERT_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+    update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
+    update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_FP16(name) \
+    update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                  \
+    update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                    name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_FP16(name)                                                                                                  \
+    update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                         name##_stride_z, name##_step_z)
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(name) \
+    update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define LOAD4(name, offset) \
+    name##_ptr[offset]
+
+#define STORE4(name, offset, value) \
+    name##_ptr[offset] = value
+
+// Load 1 element, whose size is determined by the SSBO element type.
+#define LOAD1(r, name, offset) \
+    r = name##_ptr[offset]
+
+#define STORE1(name, offset, value) \
+    name##_ptr[offset] = value
+
+#define LOAD2(r, name, offset) \
+    LOAD1(r[0], name, offset); \
+    LOAD1(r[1], name, (offset) + uint(1))
+
+#define STORE2(name, offset, value)            \
+    name##_ptr[offset]             = value[0]; \
+    name##_ptr[(offset) + uint(1)] = value[1]
+
+#define LOAD3(r, name, offset)             \
+    LOAD1(r[0], name, offset);             \
+    LOAD1(r[1], name, (offset) + uint(1)); \
+    LOAD1(r[2], name, (offset) + uint(2))
+
+#define CURRENT_OFFSET(name) \
+    name.current_offset
+
+/** Wrap vector information into a Vector structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A Vector object
+ */
+Vector update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = (vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x) >> 2;
+
+    return vector;
+}
+
+/** Wrap vector information into a Vector structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ *
+ * @return A Vector object
+ */
+Vector update_vector_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+
+    return vector;
+}
+
+/** Wrap image information into an Image structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y) >> 2;
+
+    return img;
+}
+
+/** Wrap image information into an Image structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ *
+ * @return An image object
+ */
+Image update_image_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image object
+ */
+Image update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 2D Image object
+ */
+Image update_image_from_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return img;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = (tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
+
+    return tensor;
+}
+
+/** Wrap 3D tensor information into a tensor structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Tensor3D update_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return tensor;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+uint vector_offset(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset << 2, int) + x * CONVERT(vec.stride_x, int), uint) >> 2;
+}
+
+/** Get the pointer position of a Vector
+ *
+ * @param[in] vec Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ */
+uint vector_offset_fp16(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+uint offset(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset << 2, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint) >> 2;
+}
+
+/** Get the pointer position of an Image
+ *
+ * @param[in] img Pointer to the starting position of the buffer
+ * @param[in] x   Relative X position
+ * @param[in] y   Relative Y position
+ */
+uint offset_fp16(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+uint tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset << 2, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint) >> 2;
+}
+
+/** Get the pointer position of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] x      Relative X position
+ * @param[in] y      Relative Y position
+ * @param[in] z      Relative Z position
+ */
+uint tensor3D_offset_fp16(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
+}
+
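The helpers above form the legacy 32-bit path: update_*_workitem_offset and vector_offset/offset/tensor3D_offset return indices in 4-byte words (hence the shifts by 2), while the *_fp16 variants keep plain byte offsets for the caller to resolve. A hypothetical copy kernel using the word-indexed path might look as follows, assuming this header and the usual version preamble are prepended to the shader source; the buffer names are illustrative.

    layout(local_size_x = 1) in;

    BUFFER_DECLARATION(src, 1, float, readonly);
    BUFFER_DECLARATION(dst, 2, float, writeonly);

    layout(std140) uniform shader_params
    {
        VECTOR_PARAM_DECLARATION(src);
        VECTOR_PARAM_DECLARATION(dst);
    };

    void main()
    {
        // current_offset is a 4-byte word index here, so it can index the float array directly
        Vector src = CONVERT_TO_VECTOR_STRUCT(src);
        Vector dst = CONVERT_TO_VECTOR_STRUCT(dst);

        float v;
        LOAD1(v, src, vector_offset(src, 0)); // vector_offset() also returns a word index
        STORE1(dst, vector_offset(dst, 0), v);
    }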
+/////////////////////////////////////////////////////////////
+// New-style (GC_*) helpers
+
+#define GC_CONVERT_TO_VECTOR_STRUCT(name) \
+    gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
+
+#define GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
+    gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
+
+#define GC_CONVERT_TO_IMAGE_STRUCT(name) \
+    gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+
+#define GC_CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
+    gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                  \
+    gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                                       name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
+    gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
+    gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+
+#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+    gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
+
+Vector gc_update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    Vector vector;
+    vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    vector.stride_x                      = stride_x;
+    vector.current_offset                = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
+
+    return vector;
+}
+
+Image gc_update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
+
+    return img;
+}
+
+Tensor3D gc_update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor;
+    tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    tensor.stride_x                      = stride_x;
+    tensor.stride_y                      = stride_y;
+    tensor.stride_z                      = stride_z;
+    tensor.current_offset                = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return tensor;
+}
+
+Image gc_update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Image img;
+    img.offset_first_element_in_bytes = offset_first_element_in_bytes;
+    img.stride_x                      = stride_x;
+    img.stride_y                      = stride_y;
+    img.current_offset                = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
+
+    return img;
+}
+
+#define GC_CURRENT_OFFSET(name) \
+    name.current_offset
+
+uint gc_vector_offset(Vector vec, int x)
+{
+    return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
+}
+
+uint gc_image_offset(Image img, int x, int y)
+{
+    return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
+}
+
+uint gc_tensor3D_offset(Tensor3D tensor, int x, int y, int z)
+{
+    return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
+}
+
+// The number of elements loaded/stored per access depends on the SSBO element type.
+#define GC_LOAD1(r, name, offset) \
+    r = name##_ptr[offset]
+
+#define GC_LOAD2(r, name, offset) \
+    GC_LOAD1(r[0], name, offset); \
+    GC_LOAD1(r[1], name, (offset) + uint(1))
+
+#define GC_LOAD3(r, name, offset)             \
+    GC_LOAD1(r[0], name, offset);             \
+    GC_LOAD1(r[1], name, (offset) + uint(1)); \
+    GC_LOAD1(r[2], name, (offset) + uint(2))
+
+#define GC_STORE1(value, name, offset) \
+    name##_ptr[offset] = value
+
+#define GC_STORE2(value, name, offset) \
+    GC_STORE1(value[0], name, offset); \
+    GC_STORE1(value[1], name, (offset) + uint(1))
+
+#define GC_STORE3(value, name, offset)             \
+    GC_STORE1(value[0], name, offset);             \
+    GC_STORE1(value[1], name, (offset) + uint(1)); \
+    GC_STORE1(value[2], name, (offset) + uint(2))
+
+// These variants have to be expanded manually since the compiler does not support it.
+#define GC_LOAD1_1D_OFFSET(r, name, x) \
+    GC_LOAD1(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_2D_OFFSET(r, name, x, y) \
+    GC_LOAD1(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD1_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD1(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_1D_OFFSET(value, name, x) \
+    GC_STORE1(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_2D_OFFSET(value, name, x, y) \
+    GC_STORE1(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE1_3D_OFFSET(value, name, x, y, z) \
+    GC_STORE1(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_1D_OFFSET(r, name, x) \
+    GC_LOAD2(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_2D_OFFSET(r, name, x, y) \
+    GC_LOAD2(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD2_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD2(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_1D_OFFSET(value, name, x) \
+    GC_STORE2(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_2D_OFFSET(value, name, x, y) \
+    GC_STORE2(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_STORE2_3D_OFFSET(value, name, x, y, z) \
+    GC_STORE2(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_1D_OFFSET(r, name, x) \
+    GC_LOAD3(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_2D_OFFSET(r, name, x, y) \
+    GC_LOAD3(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
+
+#define GC_LOAD3_3D_OFFSET(r, name, x, y, z) \
+    GC_LOAD3(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
+
+/////////////////////////////////////////////////////////////
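These GC_*_OFFSET macros are the ones exercised by the GEMM and bias kernels earlier in this patch: the gc_*_offset() functions return a byte offset, and shifting it right by name##_buffer_data_type_size (presumably the log2 of the SSBO element size in bytes, filled in by the host side) yields an index into the typed pointer array. A hypothetical pass-through kernel in the same style, with illustrative buffer names and the header assumed to be prepended to the shader source:

    layout(local_size_x = 1) in;

    BUFFER_DECLARATION(src, 1, uvec2, readonly);  // one uvec2 carries four packed F16 values
    BUFFER_DECLARATION(dst, 2, uvec2, writeonly);

    layout(std140) uniform shader_params
    {
        IMAGE_PARAM_DECLARATION(src);
        IMAGE_PARAM_DECLARATION(dst);
    };

    void main()
    {
        Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
        Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);

        uvec2 packed_v;
        GC_LOAD1_2D_OFFSET(packed_v, src, 0, 0);  // byte offset >> src_buffer_data_type_size
        GC_STORE1_2D_OFFSET(packed_v, dst, 0, 0); // then indexes the uvec2 array directly
    }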
+
+#endif // ARM_COMPUTE_HELPER_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
new file mode 100755
index 0000000..404b46a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_CS_H
+#define ARM_COMPUTE_HELPER_CS_H
+
+#define SHADER_PARAMS_DECLARATION \
+    layout(std140, binding = 0) uniform shader_params
+
+#define TENSOR_DECLARATION(location, buffer_type, type, ptr_name, shift_name, element_shift, access) \
+    layout(std430, binding = location) access buffer buffer_type                                     \
+    {                                                                                                \
+        type ptr_name[];                                                                             \
+    };                                                                                               \
+    const uint shift_name = uint(element_shift)
+
+struct VectorAttributes
+{
+    uint stride_x;                      /**< Stride of the vector in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the vector (in bytes) */
+    uint padding;                       /**< Padding to round the structure up to a multiple of a vec4 */
+};
+
+struct ImageAttributes
+{
+    uint stride_x;                      /**< Stride of the image in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint stride_y;                      /**< Stride of the image in Y dimension (in bytes) */
+    uint step_y;                        /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the image (in bytes) */
+    uint padding1;                      /**< Padding to round the structure up to a multiple of a vec4 */
+    uint padding2;                      /**< Padding to round the structure up to a multiple of a vec4 */
+    uint padding3;                      /**< Padding to round the structure up to a multiple of a vec4 */
+};
+
+struct Tensor3DAttributes
+{
+    uint stride_x;                      /**< Stride of the tensor in X dimension (in bytes) */
+    uint step_x;                        /**< stride_x * number of elements along X processed per workitem (in bytes) */
+    uint stride_y;                      /**< Stride of the tensor in Y dimension (in bytes) */
+    uint step_y;                        /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+    uint stride_z;                      /**< Stride of the tensor in Z dimension (in bytes) */
+    uint step_z;                        /**< stride_z * number of elements along Z processed per workitem (in bytes) */
+    uint offset_first_element_in_bytes; /**< The offset of the first element in the tensor (in bytes) */
+    uint padding;                       /**< Padding to round the structure up to a multiple of a vec4 */
+};
+
+struct VectorIterator
+{
+    int current_offset_in_bytes; /**< Current offset of vector (in bytes) */
+    int stride_x;                /**< Stride of the vector in X dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+struct ImageIterator
+{
+    int current_offset_in_bytes; /**< Current offset of image (in bytes) */
+    int stride_x;                /**< Stride of the image in X dimension (in bytes) */
+    int stride_y;                /**< Stride of the image in Y dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+struct Tensor3DIterator
+{
+    int current_offset_in_bytes; /**< Current offset of tensor (in bytes) */
+    int stride_x;                /**< Stride of the tensor in X dimension (in bytes) */
+    int stride_y;                /**< Stride of the tensor in Y dimension (in bytes) */
+    int stride_z;                /**< Stride of the tensor in Z dimension (in bytes) */
+    int element_shift;           /**< The number of bits to shift by for one element */
+};
+
+#define CONVERT_TO_VECTOR_ITERATOR(attrs, element_shift)                          \
+    update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                              attrs.stride_x, attrs.step_x)
+
+#define CONVERT_TO_VECTOR_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                              attrs.stride_x, uint(0))
+
+#define CONVERT_TO_IMAGE_ITERATOR(attrs, element_shift)                          \
+    update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                             attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y)
+
+#define CONVERT_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                             attrs.stride_x, uint(0), attrs.stride_y, uint(0))
+
+#define CONVERT_TO_TENSOR3D_ITERATOR(attrs, element_shift)                          \
+    update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(attrs, element_shift)                  \
+    update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, uint(0))
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(attrs, element_shift)                               \
+    update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                           attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift)                       \
+    update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+                                           attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, attrs.step_z)
+
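The pieces above can be put together as follows. The load/store helpers that go with this header are not part of this hunk, so the indexing in this hypothetical sketch is written out by hand from current_offset_in_bytes and the element shift; the attribute/buffer names, the element shift of 2 (a 4-byte float per SSBO element) and the version preamble are all assumptions.

    SHADER_PARAMS_DECLARATION
    {
        ImageAttributes src_attrs;
        ImageAttributes dst_attrs;
    };

    TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
    TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

    layout(local_size_x = 1) in;

    void main()
    {
        ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
        ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);

        // Byte offset -> element index: shift by the element size exponent (1 << 2 = 4 bytes)
        float v = src_ptr[uint(src_iter.current_offset_in_bytes) >> src_shift];
        dst_ptr[uint(dst_iter.current_offset_in_bytes) >> dst_shift] = v;
    }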
+/** Wrap vector information into a VectorIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x                      Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ *
+ * @return A VectorIterator object
+ */
+VectorIterator update_vector_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+    VectorIterator vector_iter;
+    vector_iter.element_shift           = int(element_shift);
+    vector_iter.stride_x                = int(stride_x);
+    vector_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x);
+
+    return vector_iter;
+}
+
+/** Wrap image information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+    ImageIterator image_iter;
+    image_iter.element_shift           = int(element_shift);
+    image_iter.stride_x                = int(stride_x);
+    image_iter.stride_y                = int(stride_y);
+    image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y);
+
+    return image_iter;
+}
+
+/** Wrap 3D tensor information into a Tensor3DIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return A Tensor3DIterator object
+ */
+Tensor3DIterator update_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3DIterator tensor_iter;
+    tensor_iter.element_shift           = int(element_shift);
+    tensor_iter.stride_x                = int(stride_x);
+    tensor_iter.stride_y                = int(stride_y);
+    tensor_iter.stride_z                = int(stride_z);
+    tensor_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+    return tensor_iter;
+}
+
+/** Wrap 3D tensor information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift                 The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x                      Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y                      Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z                      Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_from_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    ImageIterator image_iter;
+    image_iter.element_shift           = int(element_shift);
+    image_iter.stride_x                = int(stride_x);
+    image_iter.stride_y                = int(stride_y);
+    image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+    return image_iter;
+}
+
+#define VECTOR_OFFSET(tensor_iter, x) \
+    uint(vector_offset_in_bytes(tensor_iter, int(x)) >> tensor_iter.element_shift)
+
+#define IMAGE_OFFSET(tensor_iter, x, y) \
+    uint(image_offset_in_bytes(tensor_iter, int(x), int(y)) >> tensor_iter.element_shift)
+
+#define TENSOR3D_OFFSET(tensor_iter, x, y, z) \
+    uint(tensor3D_offset_in_bytes(tensor_iter, int(x), int(y), int(z)) >> tensor_iter.element_shift)
+
+#define TENSOR_OFFSET_ADVANCE_IN_BYTES(tensor_iter, n) \
+    uint((tensor_iter.current_offset_in_bytes + int(n)) >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET(tensor_iter) \
+    uint(tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET_IN_BYTES(tensor_iter) \
+    uint(tensor_iter.current_offset_in_bytes)
+
+#define TENSOR_ITERATOR_ADVANCE_IN_BYTES(tensor_iter, n) \
+    tensor_iter.current_offset_in_bytes += int(n)
+
+/** Get the offset of a VectorIterator
+ *
+ * @param[in] vector_iter The VectorIterator object pointing to the starting position of the buffer
+ * @param[in] x           Relative X position
+ *
+ * @return The relative offset of the VectorIterator object (in bytes)
+ */
+uint vector_offset_in_bytes(VectorIterator vector_iter, int x)
+{
+    return uint(vector_iter.current_offset_in_bytes + x * vector_iter.stride_x);
+}
+
+/** Get the offset of an ImageIterator
+ *
+ * @param[in] image_iter  The ImageIterator object pointing to the starting position of the buffer
+ * @param[in] x           Relative X position
+ * @param[in] y           Relative Y position
+ *
+ * @return The relative offset of the ImageIterator object (in bytes)
+ */
+uint image_offset_in_bytes(ImageIterator image_iter, int x, int y)
+{
+    return uint(image_iter.current_offset_in_bytes + x * image_iter.stride_x + y * image_iter.stride_y);
+}
+
+/** Get the offset of a Tensor3DIterator
+ *
+ * @param[in] tensor_iter The Tensor3DIterator object pointing to the starting position of the buffer
+ * @param[in] x           Relative X position
+ * @param[in] y           Relative Y position
+ * @param[in] z           Relative Z position
+ *
+ * @return The relative offset of the Tensor3DIterator object (in bytes)
+ */
+uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
+{
+    return uint(tensor_iter.current_offset_in_bytes + x * tensor_iter.stride_x + y * tensor_iter.stride_y + z * tensor_iter.stride_z);
+}
+
+#define LOAD(tensor_ptr, offset) tensor_ptr[offset]
+#define STORE(tensor_ptr, offset, data) tensor_ptr[offset] = data
+#define LOAD_CURRENT_ITEM(tensor_ptr, tensor_iter) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)]
+#define STORE_CURRENT_ITEM(tensor_ptr, tensor_iter, data) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)] = data
+
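+// Illustrative usage only: with a Tensor3DIterator "src_iter" and a float buffer "src_ptr"
+// (placeholder names), the element one row below the current workitem position can be read as:
+//     float below = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, 0, 1, 0));
+// and TENSOR3D_OFFSET(src_iter, 0, 0, 0) is equivalent to CURRENT_ITEM_OFFSET(src_iter).
+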
+#define VLOAD2(return_type, tensor_ptr, offset) \
+    return_type(LOAD(tensor_ptr, offset),       \
+                LOAD(tensor_ptr, (offset) + uint(1)))
+
+#define VSTORE2(tensor_ptr, offset, data) \
+    STORE(tensor_ptr, offset, data[0]);   \
+    STORE(tensor_ptr, (offset) + uint(1), data[1])
+
+#define VLOAD2_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD2(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE2(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD3(return_type, tensor_ptr, offset)       \
+    return_type(LOAD(tensor_ptr, offset),             \
+                LOAD(tensor_ptr, (offset) + uint(1)), \
+                LOAD(tensor_ptr, (offset) + uint(2)))
+
+#define VSTORE3(tensor_ptr, offset, data)           \
+    STORE(tensor_ptr, offset, data[0]);             \
+    STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+    STORE(tensor_ptr, (offset) + uint(2), data[2])
+
+#define VLOAD3_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD3(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE3_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE3(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4(return_type, tensor_ptr, offset)       \
+    return_type(LOAD(tensor_ptr, offset),             \
+                LOAD(tensor_ptr, (offset) + uint(1)), \
+                LOAD(tensor_ptr, (offset) + uint(2)), \
+                LOAD(tensor_ptr, (offset) + uint(3)))
+
+#define VSTORE4(tensor_ptr, offset, data)           \
+    STORE(tensor_ptr, offset, data[0]);             \
+    STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+    STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+    STORE(tensor_ptr, (offset) + uint(3), data[3])
+
+#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+/** Converting the vec4 object to 4 half-precision (16-bit) floating point values and packing into a uvec2 object
+ *
+ * @param[in] data The vec4 object to be packed
+ *
+ * @return The packed uvec2 object
+ */
+highp uvec2 pack4_half(mediump vec4 data)
+{
+    return uvec2(packHalf2x16(data.xy), packHalf2x16(data.zw));
+}
+
+/** Unpacking the uvec2 object to 4 half-precision (16-bit) floating point values and converting to a vec4 object
+ *
+ * @param[in] packed_data The uvec2 object to be unpacked
+ *
+ * @return The unpacked vec4 object
+ */
+mediump vec4 unpack4_half(highp uvec2 packed_data)
+{
+    return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
+}
+
+/** Converting the vec4[2] object to 8 half-precision (16-bit) floating point values and packing into a uvec4 object
+ *
+ * @param[in] data The vec4[2] object to be packed
+ *
+ * @return The packed uvec4 object
+ */
+highp uvec4 pack8_half(mediump vec4 data[2])
+{
+    return uvec4(packHalf2x16(data[0].xy), packHalf2x16(data[0].zw),
+                 packHalf2x16(data[1].xy), packHalf2x16(data[1].zw));
+}
+
+/** Unpacking the uvec4 object to 8 half-precision (16-bit) floating point values and converting to a vec4[2] object
+ *
+ * @param[in] packed_data The uvec4 object to be unpacked
+ *
+ * @return The unpacked vec4[2] object
+ */
+mediump vec4[2] unpack8_half(highp uvec4 packed_data)
+{
+    return vec4[2](vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y)),
+                   vec4(unpackHalf2x16(packed_data.z), unpackHalf2x16(packed_data.w)));
+}
+
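+// Illustrative example only: pack4_half and unpack4_half are inverses up to FP16 rounding, e.g.
+//     mediump vec4  val    = vec4(1.0, 2.0, 3.0, 4.0);
+//     highp   uvec2 packed = pack4_half(val);      // two packHalf2x16 words
+//     mediump vec4  back   = unpack4_half(packed); // approximately equal to val
+// pack8_half/unpack8_half follow the same pattern for vec4[2] <-> uvec4.
+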
+// For half-precision (16-bit) floating point packed into a "uint" element
+#define LOAD_UNPACK2_HALF(tensor_ptr, offset) unpackHalf2x16(uint(LOAD(tensor_ptr, offset)))
+#define STORE_PACK2_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, packHalf2x16(data))
+#define LOAD_UNPACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(VLOAD2(uvec2, tensor_ptr, offset))
+#define VSTORE2_PACK4_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack4_half(data))
+#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
+#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
+#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE4_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bit) floating point packed into a "uvec2" element
+#define LOAD_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(uvec2(LOAD(tensor_ptr, offset)))
+#define STORE_PACK4_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_half(data))
+#define LOAD_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD2(uvec4, tensor_ptr, offset))
+#define VSTORE2_PACK8_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack8_half(data))
+#define VLOAD2_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bit) floating point packed into a "uvec4" element
+#define LOAD_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(uvec4(LOAD(tensor_ptr, offset)))
+#define STORE_PACK8_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack8_half(data))
+#define LOAD_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
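+// Illustrative usage only: the three groups of *_HALF macros above differ only in the element
+// type of the backing buffer (a uint holds 2 halves, a uvec2 holds 4, a uvec4 holds 8). For a
+// buffer declared with one uvec2 per element (placeholder names, element shift of 3):
+//     mediump vec4 data = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+//     STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
+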
+/** Converting the uvec4 object to 4 low-precision uint values and packing into a uint object
+ *
+ * @param[in] data The uvec4 object to be packed
+ *
+ * @return The packed uint object
+ */
+highp uint pack4_u8(lowp uvec4 data)
+{
+    highp uint r = uint(0);
+
+    for(int i = 0; i < 4; i++)
+    {
+        r |= data[i] << uint(i * 8);
+    }
+
+    return r;
+}
+
+/** Unpacking the uint object to 4 low-precision uint values and converting to a uvec4 object
+ *
+ * @param[in] packed_data The uint object to be unpacked
+ *
+ * @return The unpacked uvec4 object
+ */
+lowp uvec4 unpack4_u8(highp uint packed_data)
+{
+    lowp uvec4 uvec;
+
+    for(int i = 0; i < 4; i++)
+    {
+        uvec[i] = (packed_data >> uint(i * 8)) & uint(0xFF);
+    }
+
+    return uvec;
+}
+
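+// Illustrative example only: the first component ends up in the least-significant byte, so
+// pack4_u8(uvec4(1u, 2u, 3u, 4u)) == 0x04030201u and unpack4_u8(0x04030201u) == uvec4(1u, 2u, 3u, 4u).
+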
+#define LOAD_UNPACK4_U8(tensor_ptr, offset) unpack4_u8(uint(LOAD(tensor_ptr, offset)))
+#define STORE_PACK4_U8(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_u8(data))
+#define LOAD_UNPACK4_CURRENT_ITEM_U8(tensor_ptr, tensor_iter) LOAD_UNPACK4_U8(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK4_CURRENT_ITEM_U8(tensor_ptr, tensor_iter, data) STORE_PACK4_U8(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#endif // ARM_COMPUTE_HELPER_CS_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
new file mode 100755
index 0000000..166953f
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src1);
+    TENSOR3D_PARAM_DECLARATION(src2);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+#ifdef CROSS_MAP
+/** Apply cross map normalization.
+ *
+ * @note The normalization coefficient (alpha parameter divided by norm_size) should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in]  src1_ptr                           Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                           Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float acc = 0.0;
+
+    int num_of_slices = int(gl_NumWorkGroups.z * gl_WorkGroupSize.z);
+    int current_slice = int(gl_GlobalInvocationID.z);
+
+    int left_slice  = max(current_slice - int(RADIUS), int(0));
+    int right_slice = min(current_slice + int(RADIUS), int(num_of_slices - 1));
+
+    for(int i = left_slice; i <= right_slice; i++)
+    {
+        acc += src2_ptr[tensor3D_offset(src2, 0, 0, i - current_slice)];
+    }
+
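+    // Accumulated src2 values over the neighbouring slices within RADIUS are used to
+    // normalize src1: dst = src1 / (KAPPA + COEFF * acc)^BETA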
+    float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
+
+    float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+
+    dst_ptr[dst.current_offset] = normalized_pixel;
+}
+
+#elif defined(IN_MAP_1D)
+/** Apply in map normalization.
+ *
+ * @note The normalization coefficient (alpha parameter divided by norm_size) should be given as a preprocessor argument using "#define COEFF x"
+ * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
+ * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
+ * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
+ *
+ * @param[in]  src1_ptr                           Pointer to the first source tensor. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in]  src2_ptr                           Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the second source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the second source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    float acc = 0.0;
+
+    int num_of_items_x = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);
+    int current_pos    = int(gl_GlobalInvocationID.x);
+
+    int left_pos  = max(current_pos - int(RADIUS), int(0));
+    int right_pos = min(current_pos + int(RADIUS), int(num_of_items_x - 1));
+
+    for(int i = left_pos; i <= right_pos; i++)
+    {
+        acc += src2_ptr[tensor3D_offset(src2, i - current_pos, 0, 0)];
+    }
+
+    float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
+
+    float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+
+    dst_ptr[dst.current_offset] = normalized_pixel;
+}
+#endif /*CROSS_MAP*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
new file mode 100644
index 0000000..031687a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src1);
+    TENSOR3D_PARAM_DECLARATION(src2);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+BUFFER_DECLARATION(src1, 1, float, readonly);
+BUFFER_DECLARATION(src2, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+/** Performs a pixelwise multiplication of two F32 inputs with a float scale.
+ *
+ * @param[in]  src1_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  src2_ptr                           Pointer to the source image. Supported data types: Same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @note The scaling factor must be passed as a preprocessor argument using "#define SCALE x"
+ */
+void main()
+{
+    // Get pixels pointer
+    Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
+    Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    dst_ptr[dst.current_offset] = (src1_ptr[src1.current_offset] * src2_ptr[src2.current_offset] * float(SCALE));
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
new file mode 100644
index 0000000..401b002
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -0,0 +1,1444 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#if defined(DATA_TYPE_FP32)
+
+float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#define LOAD8(r, name, offset) \
+    r.x = LOAD4(name, offset); \
+    r.y = LOAD4(name, offset + uint(1))
+
+#define LOAD16(r, name, offset)          \
+    r.x = LOAD4(name, offset);           \
+    r.y = LOAD4(name, offset + uint(1)); \
+    r.z = LOAD4(name, offset + uint(2)); \
+    r.w = LOAD4(name, offset + uint(3))
+
+#define STORE16(name, offset, r)         \
+    STORE4(name, offset, r.x);           \
+    STORE4(name, offset + uint(1), r.y); \
+    STORE4(name, offset + uint(2), r.z); \
+    STORE4(name, offset + uint(3), r.w)
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b)        \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }                             \
+    if(isnan(a.z) || (a.z < b.z)) \
+    {                             \
+        res.z = b.z;              \
+    }                             \
+    if(isnan(a.w) || (a.w < b.w)) \
+    {                             \
+        res.w = b.w;              \
+    }
+#define POOL_OP_float(res, a, b) \
+    (res) = (a);                 \
+    if(isnan(a) || (a < b))      \
+    {                            \
+        res = b;                 \
+    }
+#define POOL_OP_vec2(res, a, b)   \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -3.402823466385289e+38
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+#endif /* defined(POOL_SIZE) */
+
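+// The POOLING3x3_STRIDEn macros compute four adjacent 3x3 pooling results at once: they load
+// three input rows starting at the current position, square the values when POOL_L2 is defined
+// (POW2_OP), reduce rows and columns with POOL_OP and leave the four results in res.xyzw.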
+#define POOLING3x3_STRIDE1(res, input, output)                                                                     \
+    vec4 data00;                                                                                                   \
+    vec2 data01;                                                                                                   \
+    vec4 data10;                                                                                                   \
+    vec2 data11;                                                                                                   \
+    vec4 data20;                                                                                                   \
+    vec2 data21;                                                                                                   \
+    LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0));                                                        \
+    LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                               \
+    LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0));                                                        \
+    LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                               \
+    LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0));                                                        \
+    LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                               \
+    data00 = POW2_OP(data00, 4);                                                                                   \
+    data01 = POW2_OP(data01, 2);                                                                                   \
+    data10 = POW2_OP(data10, 4);                                                                                   \
+    data11 = POW2_OP(data11, 2);                                                                                   \
+    data20 = POW2_OP(data20, 4);                                                                                   \
+    data21 = POW2_OP(data21, 2);                                                                                   \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data00.xyzy;                                                                                  \
+    values001.xyzw = data00.zwzw;                                                                                  \
+    values010.x    = data01.x;                                                                                     \
+    values010.y    = data00.w;                                                                                     \
+    values010.zw   = data01.xy;                                                                                    \
+    values100.xyzw = data10.xyzy;                                                                                  \
+    values101.xyzw = data10.zwzw;                                                                                  \
+    values11.x     = data11.x;                                                                                     \
+    values11.y     = data10.w;                                                                                     \
+    values11.zw    = data11.xy;                                                                                    \
+    values200.xyzw = data20.xyzy;                                                                                  \
+    values201.xyzw = data20.zwzw;                                                                                  \
+    values21.x     = data21.x;                                                                                     \
+    values21.y     = data20.w;                                                                                     \
+    values21.zw    = data21.xy;                                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE2(res, input, output)                                                                     \
+    vec4  data000;                                                                                                 \
+    vec4  data001;                                                                                                 \
+    float data010;                                                                                                 \
+    vec4  data100;                                                                                                 \
+    vec4  data101;                                                                                                 \
+    float data11;                                                                                                  \
+    vec4  data200;                                                                                                 \
+    vec4  data201;                                                                                                 \
+    float data21;                                                                                                  \
+    LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0));                                                       \
+    LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                             \
+    data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8));                                             \
+    LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0));                                                       \
+    LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                             \
+    data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8));                                              \
+    LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0));                                                       \
+    LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                             \
+    data21  = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8));                                             \
+    data000 = POW2_OP(data000, 4);                                                                                 \
+    data001 = POW2_OP(data001, 4);                                                                                 \
+    data010 = POW2_OP(data010, 1);                                                                                 \
+    data100 = POW2_OP(data100, 4);                                                                                 \
+    data101 = POW2_OP(data101, 4);                                                                                 \
+    data11  = POW2_OP(data11, 1);                                                                                  \
+    data200 = POW2_OP(data200, 4);                                                                                 \
+    data201 = POW2_OP(data201, 4);                                                                                 \
+    data21  = POW2_OP(data21, 1);                                                                                  \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data000.xyzz;                                                                                 \
+    values001.xyzw = vec4(data000.w, data001.xxy);                                                                 \
+    values010.xyzw = vec4(data001.zzw, data010);                                                                   \
+    values100.xyzw = data100.xyzz;                                                                                 \
+    values101.xyzw = vec4(data100.w, data101.xxy);                                                                 \
+    values11.xyzw  = vec4(data101.zzw, data11);                                                                    \
+    values200.xyzw = data200.xyzz;                                                                                 \
+    values201.xyzw = vec4(data200.w, data201.xxy);                                                                 \
+    values21.xyzw  = vec4(data201.zzw, data21);                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE3(res, input, output)                                                         \
+    vec4 data000;                                                                                      \
+    vec4 data001;                                                                                      \
+    vec4 data010;                                                                                      \
+    vec4 data100;                                                                                      \
+    vec4 data101;                                                                                      \
+    vec4 data11;                                                                                       \
+    vec4 data200;                                                                                      \
+    vec4 data201;                                                                                      \
+    vec4 data21;                                                                                       \
+    LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0));                                           \
+    LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4));                                 \
+    LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8));                                 \
+    LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0));                                           \
+    LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4));                                 \
+    LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8));                                  \
+    LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0));                                           \
+    LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4));                                 \
+    LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8));                                  \
+    data000 = POW2_OP(data000, 4);                                                                     \
+    data001 = POW2_OP(data001, 4);                                                                     \
+    data010 = POW2_OP(data010, 4);                                                                     \
+    data100 = POW2_OP(data100, 4);                                                                     \
+    data101 = POW2_OP(data101, 4);                                                                     \
+    data11  = POW2_OP(data11, 4);                                                                      \
+    data200 = POW2_OP(data200, 4);                                                                     \
+    data201 = POW2_OP(data201, 4);                                                                     \
+    data21  = POW2_OP(data21, 4);                                                                      \
+    \
+    POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw);                                                  \
+    POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw);                                                  \
+    POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
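+/** Compute the maximum value of the pooling window mapped to the current workitem.
+ *
+ * The window starts at (gl_GlobalInvocationID.xy * stride - pad) and is clamped to the
+ * given upper bounds before the element-wise reduction with POOL_OP_float.
+ */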
+float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x   = int(min(start_x + pool_size, upper_bound_w));
+    int end_y   = int(min(start_y + pool_size, upper_bound_h));
+
+    float data_max;
+    data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
+
+    for(int i = 0; (start_x + i) < end_x; ++i)
+    {
+        for(int j = 0; (start_y + j) < end_y; ++j)
+        {
+            float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+            POOL_OP_float(data_max, data_max, data);
+        }
+    }
+
+    return data_max;
+}
+
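+/** Compute the average of the pooling window mapped to the current workitem.
+ *
+ * NaN inputs are treated as zero and, when POOL_L2 is defined, each value is squared before
+ * accumulation; the sum is divided by the clamped window area.
+ */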
+float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x   = int(min(start_x + pool_size, upper_bound_w));
+    int end_y   = int(min(start_y + pool_size, upper_bound_h));
+
+    float data_total = 0.0f;
+    for(int i = 0; (start_x + i) < end_x; i++)
+    {
+        for(int j = 0; (start_y + j) < end_y; ++j)
+        {
+            float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
+            if(isnan(data))
+            {
+                data = 0.0f;
+            }
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data = POW2_OP(data, 1);
+#endif /* defined(POOL_L2) */
+            data_total = data_total + data;
+        }
+    }
+
+    return data_total / float((end_y - start_y) * (end_x - start_x));
+}
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    vec4 res;
+    // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+    // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
+    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+    int   end_y   = min((start_y + 3), MAX_HEIGHT);
+    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /*POOL_AVG*/
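+    // Worked example for the average-pooling branch above (hypothetical values): with STRIDE_X = 1,
+    // PAD_X = 1, MAX_WIDTH = 8 and gl_GlobalInvocationID.x = 0, start_x evaluates to (-1, 0, 1, 2)
+    // and end_x to (2, 3, 4, 5), so each of the four outputs is divided by a 3-wide region scaled
+    // by the clamped height (end_y - start_y).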
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    STORE16(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    //Load and calculate data
+    float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F32;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    vec4 vdata0;
+    vdata0 = vec4(INITIAL_VALUE);
+    vec4 vdata1;
+    vdata1 = vec4(INITIAL_VALUE);
+    float sdata;
+    sdata = float(INITIAL_VALUE);
+
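+    // The vectorised loop below consumes 8 input elements per iteration (two vec4 loads) and the
+    // leftover loop handles the remainder. Worked example (hypothetical value): POOL_SIZE = 13 gives
+    // one vectorised iteration at x = 0 covering elements 0..7, then five scalar iterations for
+    // x = 8..12.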
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x = 0;
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
+            LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata0, vdata0, data2);
+            POOL_OP(vdata1, vdata1, data3);
+        }
+
+        // Leftover
+        for(; x < int(POOL_SIZE); ++x)
+        {
+            float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4 *= data4;
+#endif /* defined(POOL_L2) */
+            POOL_OP_float(sdata, sdata, data4);
+        }
+    }
+
+    //Reduce result
+    vec4 reduce4;
+    POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);
+    vec2 reduce2;
+    POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
+    float res;
+    POOL_OP_float(res, reduce2.x, reduce2.y);
+    POOL_OP_float(res, res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling
+        int   start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int   end_x   = int(min(start_x + POOL_SIZE, MAX_WIDTH));
+        int   end_y   = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
+        float res1    = float((end_y - start_y) * (end_x - start_x));
+        res           = DIV_OP(res, res1);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    STORE4(dst, CURRENT_OFFSET(dst), res);
+}
+#endif /* POOLING_LAYER_2 */
+
+#elif defined(DATA_TYPE_FP16)
+
+precision mediump float;
+
+vec2 load_and_unpack(Tensor3D, uint);
+vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+
+BUFFER_DECLARATION(src, 1, uint, readonly);
+BUFFER_DECLARATION(dst, 2, uint, writeonly);
+
+layout(std140) uniform shader_params
+{
+    TENSOR3D_PARAM_DECLARATION(src);
+    TENSOR3D_PARAM_DECLARATION(dst);
+};
+
+#define LOAD2_fp16(r, name, offset) \
+    r.xy = load_and_unpack(name, offset)
+
+#define LOAD4_fp16(r, name, offset)       \
+    r.xy = load_and_unpack(name, offset); \
+    r.zw = load_and_unpack(name, offset + uint(1))
+
+#define STORE4_fp16(name, offset, r)             \
+    uint datastore1;                             \
+    uint datastore2;                             \
+    datastore1 = uint(packHalf2x16(r.xy));       \
+    datastore2 = uint(packHalf2x16(r.zw));       \
+    STORE1(name, offset << uint(1), datastore1); \
+    STORE1(name, (offset << uint(1)) + uint(1), datastore2)
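+// STORE4_fp16 packs the four FP16 results into two 32-bit words with packHalf2x16; the supplied
+// offset is doubled to index the uint-typed buffer and the two words are written back to back, so
+// callers pass an offset pre-scaled to two-word granularity (e.g. CURRENT_OFFSET(dst) >> uint(3)
+// further below).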
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(res, a, b) ((res) = (a) + (b))
+#define POOL_OP_float(res, a, b) (res = a + b)
+#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(res, a, b)        \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }                             \
+    if(isnan(a.z) || (a.z < b.z)) \
+    {                             \
+        res.z = b.z;              \
+    }                             \
+    if(isnan(a.w) || (a.w < b.w)) \
+    {                             \
+        res.w = b.w;              \
+    }
+#define POOL_OP_float(res, a, b) \
+    (res) = (a);                 \
+    if(isnan(a) || (a < b))      \
+    {                            \
+        res = b;                 \
+    }
+#define POOL_OP_vec2(res, a, b)   \
+    (res) = (a);                  \
+    if(isnan(a.x) || (a.x < b.x)) \
+    {                             \
+        res.x = b.x;              \
+    }                             \
+    if(isnan(a.y) || (a.y < b.y)) \
+    {                             \
+        res.y = b.y;              \
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
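+// Note: the max-pooling POOL_OP* macros above take b whenever a is NaN or a < b, e.g.
+// POOL_OP_float(res, NaN, 3.0) yields 3.0; presumably this keeps NaN values read from padded or
+// uninitialised regions from propagating through the reduction.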
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(POOL_SIZE)
+// Set the initial value for the pooling operation accordingly with the data type
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define INITIAL_VALUE 0.0f
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define INITIAL_VALUE -65504.0f
+#endif //POOL_AVG
+#endif //POOL_SIZE
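+// -65504.0 is the most negative finite FP16 value, i.e. the identity element for the max reduction,
+// while 0.0 is the identity for the sums used by average and L2 pooling.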
+
+#define POOLING3x3_STRIDE1_fp16(res, input, output)                                                                \
+    vec4 data00;                                                                                                   \
+    vec2 data01;                                                                                                   \
+    vec4 data10;                                                                                                   \
+    vec2 data11;                                                                                                   \
+    vec4 data20;                                                                                                   \
+    vec2 data21;                                                                                                   \
+    LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));                        \
+    LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));                        \
+    LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                                  \
+    LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));                        \
+    data00 = POW2_OP(data00, 4);                                                                                   \
+    data01 = POW2_OP(data01, 2);                                                                                   \
+    data10 = POW2_OP(data10, 4);                                                                                   \
+    data11 = POW2_OP(data11, 2);                                                                                   \
+    data20 = POW2_OP(data20, 4);                                                                                   \
+    data21 = POW2_OP(data21, 2);                                                                                   \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data00.xyzy;                                                                                  \
+    values001.xyzw = data00.zwzw;                                                                                  \
+    values010.x    = data01.x;                                                                                     \
+    values010.y    = data00.w;                                                                                     \
+    values010.zw   = data01.xy;                                                                                    \
+    values100.xyzw = data10.xyzy;                                                                                  \
+    values101.xyzw = data10.zwzw;                                                                                  \
+    values11.x     = data11.x;                                                                                     \
+    values11.y     = data10.w;                                                                                     \
+    values11.zw    = data11.xy;                                                                                    \
+    values200.xyzw = data20.xyzy;                                                                                  \
+    values201.xyzw = data20.zwzw;                                                                                  \
+    values21.x     = data21.x;                                                                                     \
+    values21.y     = data20.w;                                                                                     \
+    values21.zw    = data21.xy;                                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE2_fp16(res, input, output)                                                                \
+    vec4  data000;                                                                                                 \
+    vec4  data001;                                                                                                 \
+    float data010;                                                                                                 \
+    vec4  data100;                                                                                                 \
+    vec4  data101;                                                                                                 \
+    float data11;                                                                                                  \
+    vec4  data200;                                                                                                 \
+    vec4  data201;                                                                                                 \
+    float data21;                                                                                                  \
+    vec2  datamiddle0;                                                                                             \
+    vec2  datamiddle1;                                                                                             \
+    vec2  datamiddle2;                                                                                             \
+    LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));                       \
+    datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4));             \
+    data010     = datamiddle0.x;                                                                                   \
+    LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));                       \
+    datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4));             \
+    data11      = datamiddle1.x;                                                                                   \
+    LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                                 \
+    LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));                       \
+    datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4));             \
+    data21      = datamiddle2.x;                                                                                   \
+    data000     = POW2_OP(data000, 4);                                                                             \
+    data001     = POW2_OP(data001, 4);                                                                             \
+    data010     = POW2_OP(data010, 1);                                                                             \
+    data100     = POW2_OP(data100, 4);                                                                             \
+    data101     = POW2_OP(data101, 4);                                                                             \
+    data11      = POW2_OP(data11, 1);                                                                              \
+    data200     = POW2_OP(data200, 4);                                                                             \
+    data201     = POW2_OP(data201, 4);                                                                             \
+    data21      = POW2_OP(data21, 1);                                                                              \
+    \
+    vec4 values000;                                                                                                \
+    vec4 values001;                                                                                                \
+    vec4 values010;                                                                                                \
+    vec4 values100;                                                                                                \
+    vec4 values101;                                                                                                \
+    vec4 values11;                                                                                                 \
+    vec4 values200;                                                                                                \
+    vec4 values201;                                                                                                \
+    vec4 values21;                                                                                                 \
+    values000.xyzw = data000.xyzz;                                                                                 \
+    values001.xyzw = vec4(data000.w, data001.xxy);                                                                 \
+    values010.xyzw = vec4(data001.zzw, data010);                                                                   \
+    values100.xyzw = data100.xyzz;                                                                                 \
+    values101.xyzw = vec4(data100.w, data101.xxy);                                                                 \
+    values11.xyzw  = vec4(data101.zzw, data11);                                                                    \
+    values200.xyzw = data200.xyzz;                                                                                 \
+    values201.xyzw = vec4(data200.w, data201.xxy);                                                                 \
+    values21.xyzw  = vec4(data201.zzw, data21);                                                                    \
+    POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw);                                                        \
+    POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw);                                                       \
+    POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw);                                                       \
+    POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw);                                                        \
+    POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
+
+#define POOLING3x3_STRIDE3_fp16(res, input, output)                                                    \
+    vec4 data000;                                                                                      \
+    vec4 data001;                                                                                      \
+    vec4 data010;                                                                                      \
+    vec4 data100;                                                                                      \
+    vec4 data101;                                                                                      \
+    vec4 data11;                                                                                       \
+    vec4 data200;                                                                                      \
+    vec4 data201;                                                                                      \
+    vec4 data21;                                                                                       \
+    LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)));                     \
+    LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4));           \
+    LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)));                     \
+    LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4));            \
+    LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)));                     \
+    LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2));           \
+    LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4));            \
+    data000 = POW2_OP(data000, 4);                                                                     \
+    data001 = POW2_OP(data001, 4);                                                                     \
+    data010 = POW2_OP(data010, 4);                                                                     \
+    data100 = POW2_OP(data100, 4);                                                                     \
+    data101 = POW2_OP(data101, 4);                                                                     \
+    data11  = POW2_OP(data11, 4);                                                                      \
+    data200 = POW2_OP(data200, 4);                                                                     \
+    data201 = POW2_OP(data201, 4);                                                                     \
+    data21  = POW2_OP(data21, 4);                                                                      \
+    \
+    POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw);                                                  \
+    POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw);                                                 \
+    POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw);                                                 \
+    POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw);                                                  \
+    POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
+    POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
+
+vec2 load_and_unpack(Tensor3D src, uint offset)
+{
+    uint packed_s;
+    vec2 s;
+    LOAD1(packed_s, src, offset);
+
+    s = vec2(unpackHalf2x16(packed_s));
+    return s;
+}
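+// load_and_unpack reads one 32-bit word from the uint-typed src buffer and expands it into two FP16
+// values. A typical call (see calculate_max below) is
+//     vec2 pair = load_and_unpack(src, tensor3D_offset_fp16(src, x, y, 0) >> uint(2));
+// where the shift converts the offset produced by tensor3D_offset_fp16 into a uint-word index
+// (assuming that helper returns a byte offset, which is how it is used throughout this file).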
+
+vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
+    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));
+
+    int start_x2 = start_x1 + stride_x;
+    int start_y2 = start_y1;
+    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));
+    int end_y2   = int(min(start_y2 + pool_size, upper_bound_h));
+
+    //Initialize maximum
+    vec2 data_max = vec2(0);
+
+    //Load and Set initial maximum1
+    vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
+    data_max.x      = data_init1.x;
+
+    //Load and Set initial maximum2
+    if(end_x1 < upper_bound_w)
+    {
+        if((stride_x % 2) == 0)
+        {
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
+            data_max.y      = data_init2.x;
+        }
+        else
+        {
+            vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
+            data_max.y      = data_init2.y;
+        }
+    }
+
+    for(int i = 0; (start_y1 + i) < end_y1; i++)
+        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+        {
+            //Calculate maximum1
+            if((start_x1 + j + 1) < end_x1)
+            {
+                vec2  data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+                float data_mr1;
+                POOL_OP_float(data_mr1, data1.x, data1.y);
+                POOL_OP_float(data_max.x, data_max.x, data_mr1);
+            }
+            else
+            {
+                vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+                POOL_OP_float(data_max.x, data_max.x, data1.x);
+            }
+
+            //Calculate maximum2
+            if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+            {
+                if((stride_x % 2) == 0)
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
+
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data2.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.x);
+                    }
+                }
+                else
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+                    vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        float data_mr2;
+                        POOL_OP_float(data_mr2, data3.x, data2.y);
+                        POOL_OP_float(data_max.y, data_max.y, data_mr2);
+                    }
+                    else
+                    {
+                        POOL_OP_float(data_max.y, data_max.y, data2.y);
+                    }
+                }
+            }
+        }
+    return data_max;
+}
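+// calculate_max evaluates two horizontally adjacent pooling windows per invocation (the second one
+// shifted by stride_x) and returns the two maxima in .x and .y, so the caller can pack both FP16
+// outputs into a single uint with packHalf2x16.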
+
+vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+    int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+    int end_x1   = int(min(start_x1 + pool_size, upper_bound_w));
+    int end_y1   = int(min(start_y1 + pool_size, upper_bound_h));
+
+    int start_x2 = start_x1 + stride_x;
+    int start_y2 = start_y1;
+    int end_x2   = int(min(start_x2 + pool_size, upper_bound_w));
+    int end_y2   = int(min(start_y2 + pool_size, upper_bound_h));
+
+    //Initialize sum
+    float data_total1 = float(0);
+    float data_total2 = float(0);
+    for(int i = 0; (start_y1 + i) < end_y1; i++)
+        for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+        {
+            vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+            //Calculate sum1
+            if((start_x1 + j + 1) < end_x1)
+            {
+                data_total1 = data_total1 + data1.x + data1.y;
+            }
+            else
+            {
+                data_total1 = data_total1 + data1.x;
+            }
+
+            //Calculate sum2
+            if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+            {
+                if((stride_x % 2) == 0)
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+                    // Raise to power of 2 for L2 Pooling
+                    data2 = POW2_OP(data2, 2);
+#endif /* defined(POOL_L2) */
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        data_total2 = data_total2 + data2.x + data2.y;
+                    }
+                    else
+                    {
+                        data_total2 = data_total2 + data2.x;
+                    }
+                }
+                else
+                {
+                    vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
+                    vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
+#if defined(POOL_L2)
+                    // Raise to power of 2 for L2 Pooling
+                    data2 = POW2_OP(data2, 2);
+                    data3 = POW2_OP(data3, 2);
+#endif /* defined(POOL_L2) */
+                    if((start_x2 + j + 1) < end_x2)
+                    {
+                        data_total2 = data_total2 + data3.x + data2.y;
+                    }
+                    else
+                    {
+                        data_total2 = data_total2 + data2.y;
+                    }
+                }
+            }
+        }
+    //Calculate average
+    vec2 data_avg;
+    data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
+    data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
+
+    return data_avg;
+}
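+// As in calculate_max, two adjacent windows are averaged at once. Worked example (hypothetical
+// values): pool_size = 3, stride_x = 2, pad_x = 0, upper_bound_w = 8 and gl_GlobalInvocationID.x = 0
+// give start_x1 = 0, end_x1 = 3, start_x2 = 2 and end_x2 = 5, so data_total1 is divided by
+// 3 * (end_y1 - start_y1) and data_total2 by 3 * (end_y2 - start_y2).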
+
+#ifdef POOLING_LAYER_2
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    //Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
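+// In the kernel above, CURRENT_OFFSET(dst) is shifted right by two because the dst buffer is
+// declared as uint and the two packed FP16 results occupy a single 32-bit word (assuming
+// CURRENT_OFFSET yields a byte offset, as its other uses in this file suggest).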
+
+#elif defined(POOLING_LAYER_3)
+/** Performs a pooling function of pool size equal to 3.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    //Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+/** Performs an optimized pooling function of pool size equal to 3 when stride_x is less than or equal to 3
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    vec4 res;
+    // Perform pooling 3x3 for 4 output elements
+#if STRIDE_X == 1
+    POOLING3x3_STRIDE1_fp16(res, src, dst);
+#elif STRIDE_X == 2
+    POOLING3x3_STRIDE2_fp16(res, src, dst);
+#elif STRIDE_X == 3
+    POOLING3x3_STRIDE3_fp16(res, src, dst);
+#endif /*STRIDE_X == 1*/
+
+    // Divide by pool region in case of average pooling
+#if defined(POOL_AVG) || defined(POOL_L2)
+    ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
+    int   start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+    ivec4 end_x   = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
+    int   end_y   = min((start_y + 3), MAX_HEIGHT);
+    res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
+}
+
+#elif defined(POOLING_LAYER_7)
+/** Performs a pooling function of pool size equal to 7.
+ *
+ * @note Supported data types are F16;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    //Load and calculate data
+    vec2 data;
+    uint res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+    data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else  /*POOL_AVG*/
+    data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+
+#elif defined(POOLING_LAYER_N)
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Supported data types are F16;
+ * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       POOL_AVG must be provided, otherwise max pooling will be performed.
+ *       MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ *       STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ *       PAD_X and PAD_Y which are the pooling paddings in x and y dimensions
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+void main(void)
+{
+    // Get pixels pointer
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+
+    vec4 vdata00;
+    vdata00 = vec4(INITIAL_VALUE);
+    vec4 vdata01;
+    vdata01 = vec4(INITIAL_VALUE);
+    vec4 vdata10;
+    vdata10 = vec4(INITIAL_VALUE);
+    vec4 vdata11;
+    vdata11 = vec4(INITIAL_VALUE);
+    vec2 sdata;
+    sdata = vec2(INITIAL_VALUE);
+
+    for(int y = 0; y < int(POOL_SIZE); y++)
+    {
+        int x = 0;
+        for(; x <= (int(POOL_SIZE) - 8); x += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata00, vdata00, data2);
+            POOL_OP(vdata10, vdata10, data3);
+        }
+
+        // Leftover
+        for(; x < int(POOL_SIZE); x = x + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x + 1) >= int(POOL_SIZE))
+            {
+                POOL_OP_float(sdata.x, sdata.x, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.x, sdata.x, data4);
+            }
+        }
+    }
+
+    for(int y = STRIDE_X; y < int(POOL_SIZE + STRIDE_X); y++)
+    {
+        int x1 = STRIDE_X;
+        for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+        {
+            vec4 data2;
+            vec4 data3;
+            LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+            LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data2 *= data2;
+            data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+            POOL_OP(vdata01, vdata01, data2);
+            POOL_OP(vdata11, vdata11, data3);
+        }
+
+        // Leftover
+        for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
+        {
+            vec2 data4middle;
+            data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+            if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+            {
+                POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+            }
+            else
+            {
+                float data4;
+                POOL_OP_float(data4, data4middle.x, data4middle.y);
+                POOL_OP_float(sdata.y, sdata.y, data4);
+            }
+        }
+    }
+
+    // Reduce result
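+    // Combine the two vec4 accumulators of each window, fold the result down to a single float
+    // (vec4 -> vec2 -> float), then fold in the scalar leftover accumulator, leaving one pooled
+    // value per window in data.x and data.y.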
+    vec4 reduce40;
+    POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw);
+    vec2 reduce20;
+    POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw);
+    vec4 reduce41;
+    POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw);
+    vec2 reduce21;
+    POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw);
+    vec2 data;
+    POOL_OP_float(data.x, reduce20.x, reduce20.y);
+    POOL_OP_float(data.x, data.x, sdata.x);
+    POOL_OP_float(data.y, reduce21.x, reduce21.y);
+    POOL_OP_float(data.y, data.y, sdata.y);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    {
+        // Divide by pool region in case of average pooling
+        int  start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+        int  start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+        int  end_x1   = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
+        int  end_y1   = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
+        int  start_x2 = start_x1 + STRIDE_X;
+        int  start_y2 = start_y1;
+        int  end_x2   = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
+        int  end_y2   = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+        vec2 res1;
+        res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
+        res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
+        data.x = DIV_OP(data.x, res1.x);
+        data.y = DIV_OP(data.y, res1.y);
+    }
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+    uint res;
+    res = uint(packHalf2x16(data));
+
+    // Store result
+    STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+}
+#endif /*POOLING_LAYER_2*/
+#endif /*DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
new file mode 100644
index 0000000..c9fabc5
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+// Common definitions
+#define MAX_OP(x, y) max((x), (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define SUB_OP(x, y) ((x) - (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define EXP_OP(x) exp((x))
+
+const float float_min = -1.0 / 0.0;
+const vec4  vec4_min  = vec4(float_min);
+
+#ifdef SOFTMAX_LAYER_MAX
+
+/** Identifies the maximum value across the 1st dimension.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
+ *
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
+ * @param[in]  width     Input image width
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    uint               width;
+};
+
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+    // Initialize local maximum
+    vec4 max_val = vec4_min;
+
+    // Calculate max of row
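+    // Process 8 elements per iteration as two vec4 loads, keeping a vec4 running maximum
+    // that is reduced to a single value after the loop.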
+    uint width3 = width >> 3;
+    for(int i = 0; i < int(width3); i++)
+    {
+        vec4 data[2];
+        data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+        data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
+        max_val = MAX_OP(data[0], max_val);
+        max_val = MAX_OP(data[1], max_val);
+    }
+
+#ifdef NON_MULTIPLE_OF_8
+    // Handle non multiple of 8
+    for(int i = int(width3 << 3); i < int(width); i++)
+    {
+        float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        max_val.x  = MAX_OP(data, max_val.x);
+    }
+#endif /* NON_MULTIPLE_OF_8 */
+
+    // Perform max reduction
+    max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+    max_val.x  = MAX_OP(max_val.x, max_val.y);
+
+    // Store result
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, max_val.x);
+}
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+    // Initialize local maximum
+    vec4 max_val = vec4_min;
+
+    // Calculate max of row
+    uint width3 = width >> 3;
+    for(int i = 0; i < int(width3); i++)
+    {
+        vec4 data[2];
+        data    = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+        max_val = MAX_OP(data[0], max_val);
+        max_val = MAX_OP(data[1], max_val);
+    }
+
+#ifdef NON_MULTIPLE_OF_8
+    // Handle non multiple of 8
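+    // width1 is the width rounded down to an even value: packed halves are consumed two at a time,
+    // and a trailing odd element (if any) is handled separately below.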
+    uint width1 = width >> 1 << 1;
+    for(int i = int(width3 << 3); i < int(width1); i = i + 2)
+    {
+        vec2 data  = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        max_val.xy = MAX_OP(data, max_val.xy);
+    }
+    if(width != width1)
+    {
+        vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0));
+        max_val.x = MAX_OP(data.x, max_val.x);
+    }
+#endif /* NON_MULTIPLE_OF_8 */
+
+    // Perform max reduction
+    max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+    max_val.x  = MAX_OP(max_val.x, max_val.y);
+
+    STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, max_val.xy);
+}
+#else  // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+
+/** Shifts the values of the input tensor by the max value calculated in the softmax_layer_max kernel,
+ * then takes the exponent of each element and sums all the elements across each row.
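+ * Together with softmax_layer_max and softmax_layer_norm this implements softmax(x_i) = exp(x_i - max(x)) / sum_j(exp(x_j - max(x))).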
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
+ *
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[in]  max_ptr   Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  max_attrs The attributes of the max values tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
+ * @param[out] sum_ptr   Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_attrs The attributes of the sum values tensor
+ * @param[in]  width     Input image width
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes max_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes sum_attrs;
+    uint               width;
+};
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
+
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
+
+    // Load max value of 1D logits vector (row)
+    vec4 max_val = vec4(LOAD_CURRENT_ITEM(max_ptr, max_iter));
+
+    // Set sum vector
+    vec4 sum1D = vec4(0);
+
+    // Shift values, exp and sum
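+    // Subtracting the row maximum before exp() keeps the exponentials in a numerically safe range;
+    // the shifted exponentials are written to dst and accumulated into sum1D for the normalization pass.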
+    uint width3 = width >> 3;
+    for(int i = 0; i < int(width3); i++)
+    {
+        vec4 data[2];
+        data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+        data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
+        data[0] = SUB_OP(data[0], max_val);
+        data[1] = SUB_OP(data[1], max_val);
+        data[0] = EXP_OP(data[0]);
+        data[1] = EXP_OP(data[1]);
+        VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data[0]);
+        VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, (i << 3) + 4, 0), data[1]);
+        sum1D = ADD_OP(sum1D, data[0]);
+        sum1D = ADD_OP(sum1D, data[1]);
+    }
+
+#ifdef NON_MULTIPLE_OF_8
+    // Handle non multiple of 8
+    for(int i = int(width3 << 3); i < int(width); i++)
+    {
+        float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        data       = SUB_OP(data, max_val.x);
+        data       = EXP_OP(data);
+        STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+        sum1D.x = ADD_OP(sum1D.x, data);
+    }
+#endif /* NON_MULTIPLE_OF_8 */
+
+    // Perform sum reduction
+    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
+
+    // Calculate and store result
+    STORE_CURRENT_ITEM(sum_ptr, sum_iter, sum1D.x);
+}
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
+
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
+
+    // Load max value of 1D logits vector (row)
+    vec2 datamaxinit = LOAD_UNPACK2_CURRENT_ITEM_HALF(max_ptr, max_iter);
+    vec4 max_val     = vec4(datamaxinit.x);
+
+    // Set sum vector
+    vec4 sum1D = vec4(0.f);
+
+    // Shift values, exp and sum
+    uint width3 = width >> 3;
+    for(int i = 0; i < int(width3); i++)
+    {
+        vec4 data[2];
+        data    = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+        data[0] = SUB_OP(data[0], max_val);
+        data[1] = SUB_OP(data[1], max_val);
+        data[0] = EXP_OP(data[0]);
+        data[1] = EXP_OP(data[1]);
+        VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
+        sum1D = ADD_OP(sum1D, data[0]);
+        sum1D = ADD_OP(sum1D, data[1]);
+    }
+
+#ifdef NON_MULTIPLE_OF_8
+    // Handle non multiple of 8
+    uint width1 = width >> 1 << 1;
+    for(int i = int(width3 << 3); i < int(width1); i = i + 2)
+    {
+        vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+        data      = SUB_OP(data, max_val.xy);
+        data      = EXP_OP(data);
+        STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+        sum1D.xy = ADD_OP(sum1D.xy, data);
+    }
+    if(width != width1)
+    {
+        float data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0)).x;
+        data       = SUB_OP(data, max_val.x);
+        data       = EXP_OP(data);
+        STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, width1, 0), vec2(data, 0.0));
+        sum1D.x = ADD_OP(sum1D.x, data);
+    }
+#endif /* NON_MULTIPLE_OF_8 */
+    // Perform sum reduction
+    sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+    sum1D.x  = ADD_OP(sum1D.x, sum1D.y);
+
+    // Calculate and store result
+    STORE_PACK2_CURRENT_ITEM_HALF(sum_ptr, sum_iter, sum1D.xy);
+}
+#else  // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_NORM)
+
+/** Divides all the values of the input tensor by the sum calculated in the softmax_layer_shift_exp_sum kernel.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ *
+ * @param[in]  src_ptr   Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in]  src_attrs The attributes of the source tensor
+ * @param[in]  sum_ptr   Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_attrs The attributes of the sum values tensor
+ * @param[out] dst_ptr   Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination tensor
+ */
+SHADER_PARAMS_DECLARATION
+{
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes sum_attrs;
+    Tensor3DAttributes dst_attrs;
+};
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
+
+    // Load sum value of the 1D logits vector (row)
+    vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
+
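+    // Each invocation normalises 8 consecutive values (two vec4 loads/stores) by the row sum.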
+    vec4 data[2];
+    data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+    data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), DIV_OP(data[0], sum_val));
+    VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 4, 0), DIV_OP(data[1], sum_val));
+}
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+void main(void)
+{
+    ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+    ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
+
+    // Load sum value of the 1D logits vector (row)
+    vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
+
+    vec4 data[2];
+    data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+    vec4 ret[2];
+    ret[0] = DIV_OP(data[0], sum_val);
+    ret[1] = DIV_OP(data[1], sum_val);
+    VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), ret);
+}
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#endif // SOFTMAX_LAYER_MAX
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
new file mode 100644
index 0000000..f8ad303
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+#include "helpers.h"
+
+#define SWAP_ROW_func(u0, l0) \
+    {                         \
+        tmp_swap = u0;        \
+        u0       = l0;        \
+        l0       = tmp_swap;  \
+    }
+
+#define SWAP_4x4_func(u0, u1, u2, u3, l0, l1, l2, l3) \
+    {                                                 \
+        vec4 tmp_swap;                                \
+        SWAP_ROW_func(u0, l0);                        \
+        SWAP_ROW_func(u1, l1);                        \
+        SWAP_ROW_func(u2, l2);                        \
+        SWAP_ROW_func(u3, l3);                        \
+    }
+
+#define TRANSPOSE_4x4_func(u0, u1, u2, u3) \
+    {                                      \
+        mat4x4 matin, matout;              \
+        matin[0] = u0;                     \
+        matin[1] = u1;                     \
+        matin[2] = u2;                     \
+        matin[3] = u3;                     \
+        matout   = transpose(matin);       \
+        u0       = matout[0];              \
+        u1       = matout[1];              \
+        u2       = matout[2];              \
+        u3       = matout[3];              \
+    }
+
+/** This OpenGL ES kernel computes the transposition of the input matrix
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note Optimization name must be passed using "#define OPTIMIZATION_NAME" for F16. e.g. "#define TRANSPOSE_8X8"
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+#ifdef DATA_TYPE_FP32
+precision highp float;
+
+BUFFER_DECLARATION(src, 1, float, readonly);
+BUFFER_DECLARATION(dst, 2, float, writeonly);
+
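+// LOAD16/STORE16 move one row of 4 floats (16 bytes) between the SSBO and a vec4, one component at a time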
+#define LOAD16(r, name, offset)              \
+    {                                        \
+        r.x = LOAD4(name, offset);           \
+        r.y = LOAD4(name, offset + uint(1)); \
+        r.z = LOAD4(name, offset + uint(2)); \
+        r.w = LOAD4(name, offset + uint(3)); \
+    }
+
+#define STORE16(name, offset, r)             \
+    {                                        \
+        STORE4(name, offset, r.x);           \
+        STORE4(name, offset + uint(1), r.y); \
+        STORE4(name, offset + uint(2), r.z); \
+        STORE4(name, offset + uint(3), r.w); \
+    }
+
+void main(void)
+{
+    // compute source address
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // load the NxN block at (x, y)
+    vec4 u0;
+    vec4 u1;
+    vec4 u2;
+    vec4 u3;
+    LOAD16(u0, src, offset(src, 0, 0));
+    LOAD16(u1, src, offset(src, 0, 1));
+    LOAD16(u2, src, offset(src, 0, 2));
+    LOAD16(u3, src, offset(src, 0, 3));
+
+    // transpose the block
+    TRANSPOSE_4x4_func(u0, u1, u2, u3);
+
+    // store the block at (y, x)
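+    // The block read at (x, y) is written at (y, x): 16 bytes (4 floats) along x come from the source
+    // y index, while the source x index selects the destination row block (hence the multiply by dst.stride_y).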
+    uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst.stride_y) + (dst.offset_first_element_in_bytes);
+
+    STORE16(dst, uint((dst_offset_in_bytes + uint(0) * dst.stride_y) >> 2), u0);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(1) * dst.stride_y) >> 2), u1);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(2) * dst.stride_y) >> 2), u2);
+    STORE16(dst, uint((dst_offset_in_bytes + uint(3) * dst.stride_y) >> 2), u3);
+}
+
+#elif defined(DATA_TYPE_FP16) /* DATA_TYPE_FP32 */
+precision mediump float;
+
+#if defined(TRANSPOSE_4X4)
+
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+
+void main(void)
+{
+    // compute source address
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // load the NxN block at (x, y)
+    vec4  u0;
+    vec4  u1;
+    vec4  u2;
+    vec4  u3;
+    uvec2 packed_s[4];
+
+    GC_LOAD1_2D_OFFSET(packed_s[0], src, 0, 0);
+    GC_LOAD1_2D_OFFSET(packed_s[1], src, 0, 1);
+    GC_LOAD1_2D_OFFSET(packed_s[2], src, 0, 2);
+    GC_LOAD1_2D_OFFSET(packed_s[3], src, 0, 3);
+
+    u0 = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    u1 = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    u2 = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+    u3 = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+
+    // transpose the block
+    TRANSPOSE_4x4_func(u0, u1, u2, u3);
+
+    // store the block at (y, x)
+    uint dst_offset_in_bytes = uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+    dst.current_offset       = dst_offset_in_bytes;
+
+    packed_s[0] = uvec2(packHalf2x16(u0.xy), packHalf2x16(u0.zw));
+    packed_s[1] = uvec2(packHalf2x16(u1.xy), packHalf2x16(u1.zw));
+    packed_s[2] = uvec2(packHalf2x16(u2.xy), packHalf2x16(u2.zw));
+    packed_s[3] = uvec2(packHalf2x16(u3.xy), packHalf2x16(u3.zw));
+
+    GC_STORE1_2D_OFFSET(packed_s[0], dst, 0, 0);
+    GC_STORE1_2D_OFFSET(packed_s[1], dst, 0, 1);
+    GC_STORE1_2D_OFFSET(packed_s[2], dst, 0, 2);
+    GC_STORE1_2D_OFFSET(packed_s[3], dst, 0, 3);
+}
+
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+
+void main(void)
+{
+    // compute source address
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    vec4  u[8][2];
+    uvec4 packed_s[8];
+
+    for(int i = 0; i < 8; i++)
+    {
+        GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
+        u[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
+        u[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+    }
+
+    // transpose the block
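+    // An 8x8 transpose is built from four 4x4 transposes of the quadrants, followed by a swap of
+    // the two off-diagonal quadrants.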
+    TRANSPOSE_4x4_func(u[0][0], u[1][0], u[2][0], u[3][0]);
+    TRANSPOSE_4x4_func(u[0][1], u[1][1], u[2][1], u[3][1]);
+    TRANSPOSE_4x4_func(u[4][0], u[5][0], u[6][0], u[7][0]);
+    TRANSPOSE_4x4_func(u[4][1], u[5][1], u[6][1], u[7][1]);
+    SWAP_4x4_func(u[0][1], u[1][1], u[2][1], u[3][1], u[4][0], u[5][0], u[6][0], u[7][0]);
+
+    // store the block at (y, x)
+    uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+    dst.current_offset       = dst_offset_in_bytes;
+
+    for(int i = 0; i < 8; i++)
+    {
+        packed_s[i] = uvec4(packHalf2x16(u[i][0].xy), packHalf2x16(u[i][0].zw), packHalf2x16(u[i][1].xy), packHalf2x16(u[i][1].zw));
+        GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+    }
+}
+
+#elif defined(TRANSPOSE_8X8_SQUARE) /* TRANSPOSE_4X4 */
+
+BUFFER_DECLARATION(src, 1, uvec4, readonly);
+BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+
+void main(void)
+{
+    Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+
+    if(gl_GlobalInvocationID.x <= gl_GlobalInvocationID.y)
+    {
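+        // Only invocations with x <= y run: each transposes block (x, y) and its mirror block (y, x)
+        // and writes each one to the other's position, so every block pair is handled exactly once.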
+        uint blk1_offset_in_bytes = src.current_offset;
+        uint blk2_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+
+        // load block1
+        vec4  u1[8][2];
+        uvec4 packed_s[8];
+
+        src.current_offset = blk1_offset_in_bytes;
+        for(int i = 0; i < 8; i++)
+        {
+            GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
+            u1[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
+            u1[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+        }
+
+        // transpose block1
+        TRANSPOSE_4x4_func(u1[0][0], u1[1][0], u1[2][0], u1[3][0]);
+        TRANSPOSE_4x4_func(u1[0][1], u1[1][1], u1[2][1], u1[3][1]);
+        TRANSPOSE_4x4_func(u1[4][0], u1[5][0], u1[6][0], u1[7][0]);
+        TRANSPOSE_4x4_func(u1[4][1], u1[5][1], u1[6][1], u1[7][1]);
+        SWAP_4x4_func(u1[0][1], u1[1][1], u1[2][1], u1[3][1], u1[4][0], u1[5][0], u1[6][0], u1[7][0]);
+
+        // write to block2
+        dst.current_offset = blk2_offset_in_bytes;
+        for(int i = 0; i < 8; i++)
+        {
+            packed_s[i] = uvec4(packHalf2x16(u1[i][0].xy), packHalf2x16(u1[i][0].zw), packHalf2x16(u1[i][1].xy), packHalf2x16(u1[i][1].zw));
+            GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+        }
+
+        // load block2
+        vec4 u2[8][2];
+
+        src.current_offset = blk2_offset_in_bytes;
+        for(int i = 0; i < 8; i++)
+        {
+            GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
+            u2[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
+            u2[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+        }
+
+        // transpose block2
+        TRANSPOSE_4x4_func(u2[0][0], u2[1][0], u2[2][0], u2[3][0]);
+        TRANSPOSE_4x4_func(u2[0][1], u2[1][1], u2[2][1], u2[3][1]);
+        TRANSPOSE_4x4_func(u2[4][0], u2[5][0], u2[6][0], u2[7][0]);
+        TRANSPOSE_4x4_func(u2[4][1], u2[5][1], u2[6][1], u2[7][1]);
+        SWAP_4x4_func(u2[0][1], u2[1][1], u2[2][1], u2[3][1], u2[4][0], u2[5][0], u2[6][0], u2[7][0]);
+
+        // write to block1
+        dst.current_offset = blk1_offset_in_bytes;
+        for(int i = 0; i < 8; i++)
+        {
+            packed_s[i] = uvec4(packHalf2x16(u2[i][0].xy), packHalf2x16(u2[i][0].zw), packHalf2x16(u2[i][1].xy), packHalf2x16(u2[i][1].zw));
+            GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+        }
+    }
+}
+
+#endif /* TRANSPOSE_4X4 */
+
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/GLES_COMPUTE/egl_entries.in
similarity index 67%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/GLES_COMPUTE/egl_entries.in
index 37857b6..64ccda6 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/GLES_COMPUTE/egl_entries.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
+EGL_ENTRY(eglGetProcAddress)
+EGL_ENTRY(eglBindAPI)
+EGL_ENTRY(eglChooseConfig)
+EGL_ENTRY(eglCreateContext)
+EGL_ENTRY(eglDestroyContext)
+EGL_ENTRY(eglGetDisplay)
+EGL_ENTRY(eglInitialize)
+EGL_ENTRY(eglMakeCurrent)
+EGL_ENTRY(eglTerminate)
+EGL_ENTRY(eglGetError)
+EGL_ENTRY(eglQueryString)
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
new file mode 100644
index 0000000..15ce8ee
--- /dev/null
+++ b/src/core/GLES_COMPUTE/gl_entries.in
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+GL_ENTRY(glAttachShader)
+GL_ENTRY(glCompileShader)
+GL_ENTRY(glCreateProgram)
+GL_ENTRY(glCreateShader)
+GL_ENTRY(glDeleteProgram)
+GL_ENTRY(glDeleteShader)
+GL_ENTRY(glDetachShader)
+GL_ENTRY(glGetProgramInfoLog)
+GL_ENTRY(glGetProgramiv)
+GL_ENTRY(glGetShaderInfoLog)
+GL_ENTRY(glGetShaderiv)
+GL_ENTRY(glLinkProgram)
+GL_ENTRY(glShaderSource)
+GL_ENTRY(glUseProgram)
+GL_ENTRY(glBindBuffer)
+GL_ENTRY(glBindBufferBase)
+GL_ENTRY(glBufferData)
+GL_ENTRY(glDeleteBuffers)
+GL_ENTRY(glDispatchCompute)
+GL_ENTRY(glFlush)
+GL_ENTRY(glGenBuffers)
+GL_ENTRY(glGetProgramResourceIndex)
+GL_ENTRY(glGetUniformLocation)
+GL_ENTRY(glMapBufferRange)
+GL_ENTRY(glMemoryBarrier)
+GL_ENTRY(glUniform1ui)
+GL_ENTRY(glUnmapBuffer)
+GL_ENTRY(glGetError)
+GL_ENTRY(glGetActiveUniformBlockiv)
+GL_ENTRY(glUniformBlockBinding)
+GL_ENTRY(glGetUniformBlockIndex)
+GL_ENTRY(glGenTextures)
+GL_ENTRY(glDeleteTextures)
+GL_ENTRY(glBindTexture)
+GL_ENTRY(glTexImage2D)
+GL_ENTRY(glGenFramebuffers)
+GL_ENTRY(glDeleteFramebuffers)
+GL_ENTRY(glBindFramebuffer)
+GL_ENTRY(glFramebufferTexture2D)
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000..c0f454d
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCAbsoluteDifferenceKernel::GCAbsoluteDifferenceKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void GCAbsoluteDifferenceKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("absdiff", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, 4, 1);
+    AccessWindowRectangle input2_access(input2->info(), 0, 0, 4, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, 4, 1);
+
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                       input2->info()->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    IGCKernel::configure(win);
+}
+
+void GCAbsoluteDifferenceKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        add_2D_tensor_argument(idx, _input1, binding++, slice);
+        add_2D_tensor_argument(idx, _input2, binding++, slice);
+        add_2D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
new file mode 100644
index 0000000..b8672c6
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCActivationLayerKernel::GCActivationLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCActivationLayerKernel::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    // Make sure _kernel is initialized before calling the parent's configure
+    _input  = input;
+    _output = input;
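+    // Default to in-place execution; _output is redirected below if a separate output tensor is provided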
+
+    if(output != nullptr)
+    {
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+        _output = output;
+    }
+
+    unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + string_from_activation_func(act_info.activation())));
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
+    build_opts.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("activation_layer", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                                  output_access);
+
+        output_access.set_valid_region(win, input->info()->valid_region());
+    }
+    else
+    {
+        update_window_and_padding(win,
+                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    }
+
+    IGCKernel::configure(win);
+}
+
+void GCActivationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..dee2a55
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f)
+{
+}
+
+void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _beta    = beta;
+    _gamma   = gamma;
+    _epsilon = epsilon;
+
+    unsigned int num_elems_processed_per_iteration = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+    }
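+    // The FP16 path processes 4 packed values per invocation; FP32 processes a single value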
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace(("#define ESPILON " + float_to_string_with_full_precision(_epsilon)));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
+    AccessWindowStatic     var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
+    AccessWindowStatic     beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
+    AccessWindowStatic     gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCBatchNormalizationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
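+    // The first 2 * num_arguments_per_3D_tensor() argument slots belong to the input/output tensors,
+    // which are re-added for every slice inside the loop below; the 1D parameter tensors are set once.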
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(idx, _mean, 3, vector_slice);
+    add_1D_tensor_argument(idx, _var, 4, vector_slice);
+    add_1D_tensor_argument(idx, _beta, 5, vector_slice);
+    add_1D_tensor_argument(idx, _gamma, 6, vector_slice);
+
+    do
+    {
+        idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
new file mode 100644
index 0000000..492f708
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+GCCol2ImKernel::GCCol2ImKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims()
+{
+}
+
+void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor    *output,
+                               std::pair<unsigned int, unsigned int> convolved_dims)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input          = input;
+    _output         = output;
+    _convolved_dims = convolved_dims;
+
+    // Create kernel
+    std::set<std::string>  build_opts;
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define COL2IM");
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+    _kernel.set_argument(idx++, _convolved_dims.first);
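+    // The convolved output width is the only static argument, so that the shader can map each input
+    // column back to an (x, y) position in the output tensor.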
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCCol2ImKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    Window slice_in  = window.first_slice_window_2D();
+    Window slice_out = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        // Set inputs
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_2D_tensor_argument(idx, _input, binding++, slice_in);
+        add_3D_tensor_argument(idx, _output, binding++, slice_out);
+        _kernel.update_shader_params();
+        enqueue(*this, slice_in);
+    }
+    while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
new file mode 100644
index 0000000..a611178
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
+    : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0)
+{
+}
+
+BorderSize GCDepthConcatenateLayerKernel::border_size() const
+{
+    return BorderSize(_top_bottom, _left_right);
+}
+
+void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The gaps between the two lowest dimensions of input and output need to be divisible by 2
+    // Otherwise it is not clear how the padding should be added onto the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Add build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Configure kernel window
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
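+    // Convert the depth offset from planes to a byte offset within the output tensor; the shader consumes it as OFFSETS_Z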
+    const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2];
+
+    build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right));
+    build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom));
+    build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
+
+    unsigned int num_elems_processed_per_iteration = 1;
+    unsigned int num_elems_read_per_iteration      = 1;
+    if(input->info()->data_type() == DataType::F32)
+    {
+        num_elems_processed_per_iteration = 1;
+        num_elems_read_per_iteration      = 1;
+    }
+    else if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+        num_elems_read_per_iteration      = 4;
+    }
+    const unsigned int num_rows_read_per_iteration = 1;
+
+    // The window spans the output in x and y, but iterates over the input's depth since every input plane is copied
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCDepthConcatenateLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        if(_input->info()->data_type() == DataType::F32)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, 1, slice);
+            add_3D_tensor_argument(idx, _output, 2, slice);
+        }
+        else if(_input->info()->data_type() == DataType::F16)
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
+            add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+        }
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
new file mode 100644
index 0000000..ca673ea
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+template <unsigned int kernel_size>
+GCDirectConvolutionLayerKernel<kernel_size>::GCDirectConvolutionLayerKernel()
+    : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1U, 1U, 1U))
+{
+}
+
+template <unsigned int kernel_size>
+BorderSize             GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
+{
+    return _border_size;
+}
+
+template <unsigned int kernel_size>
+void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
+    ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+    }
+
+    // Get convolved dimensions
+    unsigned int owidth  = 0;
+    unsigned int oheight = 0;
+    std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, owidth);
+    output_shape.set(1, oheight);
+    output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    _conv_stride_x = std::get<0>(conv_info.stride());
+    _conv_stride_y = std::get<1>(conv_info.stride());
+    _conv_pad_x    = std::get<0>(conv_info.pad());
+    _conv_pad_y    = std::get<1>(conv_info.pad());
+
+    _input       = input;
+    _weights     = weights;
+    _output      = output;
+    _bias        = bias;
+    _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
+
+    std::set<std::string> options;
+
+    options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
+    options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
+    options.emplace("#define STRIDE_Y " + support::cpp11::to_string(_conv_stride_y));
+
+    std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    options.emplace(("#define " + dt_name));
+
+    unsigned int num_elems_read_per_iteration_x    = kernel_size * _conv_stride_x;
+    unsigned int num_elems_read_per_iteration_y    = 1;
+    unsigned int num_elems_written_per_iteration_x = 1;
+    unsigned int num_elems_written_per_iteration_y = 1;
+    unsigned int num_elems_written_per_iteration_z = 1;
+
+    if(kernel_size == 3)
+    {
+        if((_conv_stride_x == 1) && (_conv_stride_y == 1))
+        {
+            switch(input->info()->data_type())
+            {
+#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
+
+                case DataType::F16:
+#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 16;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 8;
+                    num_elems_written_per_iteration_y = 3;
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 6;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 4;
+#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+                    num_elems_written_per_iteration_z = 2;
+#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
+#undef PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16
+#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
+#undef PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16
+#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16
+                    break;
+
+                case DataType::F32:
+                    options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_read_per_iteration_y    = 5;
+                    num_elems_written_per_iteration_x = 4;
+                    num_elems_written_per_iteration_y = 3;
+                    break;
+
+                default:
+                    ARM_COMPUTE_ERROR("Current data type is not supported");
+                    break;
+            }
+        }
+        else
+        {
+            switch(input->info()->data_type())
+            {
+                case DataType::F16:
+                    options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_written_per_iteration_x = 4;
+                    break;
+
+                case DataType::F32:
+#define PROCESS_4_ELEMENT
+
+#if defined(PROCESS_1_ELEMENT)
+                    options.emplace("#define PROCESS_1_ELEMENT");
+                    num_elems_read_per_iteration_x    = 3;
+                    num_elems_written_per_iteration_x = 1;
+#elif defined(PROCESS_4_ELEMENT)
+                    options.emplace("#define PROCESS_4_ELEMENT");
+                    num_elems_read_per_iteration_x    = 8;
+                    num_elems_written_per_iteration_x = 4;
+#elif defined(PROCESS_8_ELEMENT)
+                    options.emplace("#define PROCESS_8_ELEMENT");
+                    num_elems_read_per_iteration_x    = 12;
+                    num_elems_written_per_iteration_x = 8;
+#else /* PROCESS_1_ELEMENT */
+#error Have to declare how many elements to process in one thread.
+#endif /* PROCESS_1_ELEMENT */
+#undef PROCESS_1_ELEMENT
+#undef PROCESS_4_ELEMENT
+#undef PROCESS_8_ELEMENT
+                    break;
+
+                default:
+                    ARM_COMPUTE_ERROR("Current data type is not supported");
+                    break;
+            }
+        }
+    }
+    else if(kernel_size == 1)
+    {
+        if(weights->info()->dimension(2) % 2 == 0)
+        {
+            options.emplace("#define WEIGHTS_OPTIMIZATION");
+        }
+        switch(input->info()->data_type())
+        {
+            case DataType::F16:
+#define PROCESS_8X_2Y_1Z
+
+#if defined(PROCESS_4X_1Y_1Z)
+                options.emplace("#define PROCESS_4X_1Y_1Z");
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_written_per_iteration_x = 4;
+#elif defined(PROCESS_4X_2Y_1Z)
+                options.emplace("#define PROCESS_4X_2Y_1Z");
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 2;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_written_per_iteration_y = 2;
+#elif defined(PROCESS_4X_3Y_1Z)
+                options.emplace("#define PROCESS_4X_3Y_1Z");
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 3;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_written_per_iteration_y = 3;
+#elif defined(PROCESS_4X_4Y_1Z)
+                options.emplace("#define PROCESS_4X_4Y_1Z");
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 4;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_written_per_iteration_y = 4;
+#elif defined(PROCESS_4X_2Y_2Z)
+                ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(4) % 2) == 1, "Current '(weights->info()->dimension(4) % 2) == 1' is not supported");
+                options.emplace("#define PROCESS_4X_2Y_2Z");
+                num_elems_read_per_iteration_x    = 4;
+                num_elems_read_per_iteration_y    = 2;
+                num_elems_written_per_iteration_x = 4;
+                num_elems_written_per_iteration_y = 2;
+                num_elems_written_per_iteration_z = 2;
+#elif defined(PROCESS_8X_1Y_1Z)
+                options.emplace("#define PROCESS_8X_1Y_1Z");
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_written_per_iteration_x = 8;
+#elif defined(PROCESS_8X_2Y_1Z)
+                options.emplace("#define PROCESS_8X_2Y_1Z");
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_read_per_iteration_y    = 2;
+                num_elems_written_per_iteration_x = 8;
+                num_elems_written_per_iteration_y = 2;
+#else /* PROCESS_4X_1Y_1Z */
+#error Have to declare how many elements to process in one thread.
+#endif /* PROCESS_4X_1Y_1Z */
+#undef PROCESS_4X_1Y_1Z
+#undef PROCESS_4X_2Y_1Z
+#undef PROCESS_4X_3Y_1Z
+#undef PROCESS_4X_4Y_1Z
+#undef PROCESS_4X_2Y_2Z
+#undef PROCESS_8X_1Y_1Z
+#undef PROCESS_8X_2Y_1Z
+                break;
+
+            case DataType::F32:
+                num_elems_read_per_iteration_x    = 1;
+                num_elems_written_per_iteration_x = 1;
+                break;
+
+            default:
+                break;
+        }
+    }
+    else if(kernel_size == 5)
+    {
+        switch(input->info()->data_type())
+        {
+            case DataType::F16:
+                options.emplace("#define PROCESS_4X_1Y_1Z");
+                num_elems_read_per_iteration_x    = 8;
+                num_elems_written_per_iteration_x = 4;
+                break;
+
+            default:
+                break;
+        }
+    }
+    else
+    {
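+        // No specialized read/write pattern for other kernel sizes; only 1, 3 and 5 are instantiated at the end of this file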
+    }
+
+    if(_bias != nullptr)
+    {
+        options.emplace("#define BIAS");
+    }
+
+    std::stringstream kernel_name;
+    kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
+
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
+
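+    // The static arguments (weights_stride_w and weights_depth) follow the three 3D tensor arguments and, when a bias is used, the 1D bias argument set in run()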
+    unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor());
+
+    // Calculate output right and bottom border
+    const int output_width          = output->info()->dimension(0);
+    const int output_height         = output->info()->dimension(1);
+    const int output_padding_right  = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
+    const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
+
+    // Calculate input right and bottom border
+    const int input_width    = input->info()->dimension(0);
+    const int input_height   = input->info()->dimension(1);
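+    // Pad the input enough to cover the reads of the enlarged output window, but never by less than the convolution padding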
+    const int upper_bound_w  = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
+    const int upper_bound_h  = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
+    const int padding_right  = std::max(upper_bound_w, _conv_pad_x);
+    const int padding_bottom = std::max(upper_bound_h, _conv_pad_y);
+
+    BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
+
+    Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
+
+    AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom);
+    AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
+    AccessWindowStatic bias_access    = AccessWindowStatic(nullptr, 0, 0, 0, 1);
+
+    switch(weights->info()->data_type())
+    {
+        case DataType::F16:
+            if((weights->info()->dimension(2) % 2 != 0) || (kernel_size != 1))
+            {
+                weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size + 1, kernel_size);
+            }
+            if(_bias != nullptr)
+            {
+                bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0) + 1, 1);
+            }
+            break;
+
+        case DataType::F32:
+            weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size, kernel_size);
+            if(_bias != nullptr)
+            {
+                bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0), 1);
+            }
+            break;
+
+        default:
+            ARM_COMPUTE_ERROR("Current data type is not supported");
+            break;
+    }
+
+    AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+
+    if(_bias != nullptr)
+    {
+        update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
+    }
+    else
+    {
+        update_window_and_padding(win, input_access, weights_access, output_access);
+    }
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    _kernel.set_argument(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
+    _kernel.set_argument(idx++, _weights->info()->dimension(2));          // weights_depth
+
+    IGCKernel::configure(win);
+}
+
+template <unsigned int kernel_size>
+void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    // Get initial windows
+    Window slice  = window.first_slice_window_3D();
+    Window win_in = window;
+
+    win_in.adjust(Window::DimX, -_conv_pad_x, true);
+    win_in.adjust(Window::DimY, -_conv_pad_y, true);
+    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+    Window slice_in = win_in.first_slice_window_3D();
+
+    unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+    add_3D_tensor_argument(idx1, _weights, BufferParam(3, 2), slice);
+
+    if(_bias != nullptr)
+    {
+        Window slice_bias;
+        slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, _bias, BufferParam(4, 2), slice_bias);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+
+        switch(_input->info()->data_type())
+        {
+            case DataType::F16:
+                switch(kernel_size)
+                {
+                    case 1:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
+                        break;
+
+                    case 3:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+                        break;
+
+                    case 5:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
+                        break;
+                }
+                break;
+
+            case DataType::F32:
+                switch(kernel_size)
+                {
+                    case 1:
+                    case 5:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 2), slice);
+                        break;
+
+                    case 3:
+                        add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
+                        add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
+                        break;
+
+                    default:
+                        ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
+                        break;
+                }
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice, _lws);
+    }
+    while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
+}
+
+template class arm_compute::GCDirectConvolutionLayerKernel<1>;
+template class arm_compute::GCDirectConvolutionLayerKernel<3>;
+template class arm_compute::GCDirectConvolutionLayerKernel<5>;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
new file mode 100644
index 0000000..e87c902
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <random>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCDropoutLayerKernel::GCDropoutLayerKernel()
+    : _input(nullptr), _mask(nullptr), _output(nullptr), _num_elems_processed_per_iteration(0)
+{
+}
+
+void GCDropoutLayerKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
+
+    _input  = input;
+    _mask   = mask;
+    _output = output;
+
+    std::set<std::string>                 build_opts;
+    std::string                           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    std::string                           fporbp  = forward ? "FORWARD" : "BACKWARD";
+    std::random_device                    rd;
+    std::mt19937                          mt(rd());
+    std::uniform_real_distribution<float> dist(0.f, 1.f);
+
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define RATIO " + support::cpp11::to_string(ratio));
+    build_opts.emplace("#define SCALE " + support::cpp11::to_string(1. / (1. - ratio)));
+    build_opts.emplace("#define SEED " + support::cpp11::to_string(dist(mt)));
+    build_opts.emplace("#define " + dt_name);
+    build_opts.emplace("#define " + fporbp);
+
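+    // Each step consumes 4 bytes, i.e. one F32 value or two F16 values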
+    _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("dropout", build_opts));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCDropoutLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
+        add_3D_tensor_argument(idx, _mask, BufferParam(2, 2), slice);
+        add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
new file mode 100644
index 0000000..b4efc0b
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCFillBorderKernel::GCFillBorderKernel()
+    : IGCKernel(), _tensor(nullptr)
+{
+}
+
+bool GCFillBorderKernel::is_parallelisable() const
+{
+    return false;
+}
+
+template <class T>
+void GCFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+    T value;
+    constant_border_value.get(value);
+    _kernel.set_argument(idx, static_cast<T>(value));
+}
+
+void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+    border_size.limit(tensor->info()->padding());
+
+    // If there is no border: early exit
+    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    {
+        return;
+    }
+
+    // Select appropriate kernel
+    std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Define build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define BORDER_SIZE_TOP " + support::cpp11::to_string(border_size.top));
+    build_opts.emplace("#define BORDER_SIZE_BOTTOM " + support::cpp11::to_string(border_size.bottom));
+    build_opts.emplace("#define BORDER_SIZE_LEFT " + support::cpp11::to_string(border_size.left));
+    build_opts.emplace("#define BORDER_SIZE_RIGHT " + support::cpp11::to_string(border_size.right));
+
+    if(border_mode == BorderMode::REPLICATE)
+    {
+        build_opts.emplace("#define FILL_IMAGE_BORDERS_REPLICATE\n");
+    }
+    else
+    {
+        build_opts.emplace("#define FILL_IMAGE_BORDERS_CONSTANT\n");
+    }
+
+    switch(tensor->info()->data_type())
+    {
+        case DataType::F16:
+            build_opts.emplace("#define DATA_TYPE_FP16");
+            break;
+
+        case DataType::F32:
+            build_opts.emplace("#define DATA_TYPE_FP32");
+            break;
+
+        default:
+            ARM_COMPUTE_ERROR("Current data type is not supported");
+            break;
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _tensor = tensor;
+
+    // Create static kernel arguments
+    const unsigned int valid_width       = tensor->info()->valid_region().shape[0];
+    const unsigned int valid_height      = tensor->info()->valid_region().shape[1];
+    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+    // Set static kernel arguments
+    unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
+    _kernel.set_argument(idx++, valid_width);
+    _kernel.set_argument(idx++, valid_height);
+    _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[0]);
+    _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[1]);
+
+    if(BorderMode::CONSTANT == border_mode)
+    {
+        set_constant_border<float>(idx++, constant_border_value);
+    }
+
+    // Configure kernel window
+    Window win;
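+    // One invocation per column of the top/bottom borders (total_valid_width) plus one per row of the left/right borders (valid_height)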
+    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
+
+    IGCKernel::configure(win);
+}
+
+void GCFillBorderKernel::run(const Window &window)
+{
+    // Border mode undefined or border width == 0
+    if(_kernel.get_program() == 0)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _tensor, 1, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000..4bc6731
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, input->info()->dimension(0) * 4);
+    output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_INTERLEAVE4x4");
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts));
+
+    // Configure kernel window
+    const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input->info()->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
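+    // The output access is scaled by 4 in x and 1/4 in y to match the interleaved output shape [width * 4, ceil(height / 4)]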
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMInterleave4x4Kernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    /*
+     * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+     */
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = window.first_slice_window_2D();
+
+    // Change x and y steps for the slide of output tensor
+    out_slice.scale(Window::DimX, 4.f);
+    out_slice.scale(Window::DimY, 0.25f);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, in_slice);
+        add_2D_tensor_argument(idx, _output, 2, out_slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000..944585d
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
+    : _accum(nullptr), _biases(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
+{
+}
+
+void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTensor *biases)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+    _biases = biases;
+    _accum  = accum;
+
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
+
+#define ACCUM_PROCESS_4X
+
+#if defined(ACCUM_PROCESS_4X)
+    build_opts.emplace("#define ACCUM_PROCESS_4X");
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+    build_opts.emplace("#define ACCUM_PROCESS_8X");
+#endif                          /* ACCUM_PROCESS_4X */
+    std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
+
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 1;
+
+    if(_accum->info()->data_type() == DataType::F32)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
+    else if(_accum->info()->data_type() == DataType::F16)
+    {
+#if defined(ACCUM_PROCESS_4X)
+        num_elems_processed_per_iteration = 4;
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+        num_elems_processed_per_iteration = 8;
+#endif                          /* ACCUM_PROCESS_4X */
+    }
+
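+    // Round the accumulator width up to a multiple of the vector size times the local size; the difference becomes right padding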
+    const int  accum_width         = accum->info()->dimension(0);
+    const int  accum_padding_right = ceil_to_multiple(accum_width, num_elems_processed_per_iteration * _lws[0]) - accum_width;
+    BorderSize border              = BorderSize(0, accum_padding_right, 0, 0);
+
+    Window win = calculate_max_enlarged_window(*_accum->info(), Steps(num_elems_processed_per_iteration), border);
+
+    AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration * _lws[0]), biases->info()->dimension(1));
+    AccessWindowStatic accum_access(_accum->info(), 0, 0, accum_width + accum_padding_right, _accum->info()->dimension(1));
+
+    update_window_and_padding(win, biases_access, accum_access);
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window accum_slice = window.first_slice_window_2D();
+
+    Window biases_slice(accum_slice);
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    // Run kernel
+    do
+    {
+        // Set arguments
+        unsigned int idx = 0;
+        if(_accum->info()->data_type() == DataType::F32)
+        {
+            add_2D_tensor_argument(idx, _accum, 1, accum_slice);
+            add_1D_tensor_argument(idx, _biases, 2, biases_slice);
+        }
+        else if(_accum->info()->data_type() == DataType::F16)
+        {
+#if defined(ACCUM_PROCESS_4X)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+            BufferParam param             = { 1, 4 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#endif                          /* ACCUM_PROCESS_4X */
+        }
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, accum_slice, _lws);
+    }
+    while(window.slide_window_slice_2D(accum_slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000..cf5d378
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMMatrixAdditionKernel::configure(const IGCTensor *input, IGCTensor *output, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+    _input                                               = input;
+    _output                                              = output;
+    const unsigned int num_elems_processed_per_iteration = max_gc_vector_width / data_size_from_type(input->info()->data_type());
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define BETA " + float_to_string_with_full_precision(beta));
+
+    // Create kernel
+    build_opts.emplace("#define GEMM_MATRIXADDITION");
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_ma", build_opts);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMMatrixAdditionKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, slice);
+        add_2D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000..8179525
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+    if(!is_interleaved_transposed)
+    {
+        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+    }
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    Window                win;
+
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0)));
+    build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha));
+
+    // If the input matrices have been interleaved and transposed, run the optimized matrix-multiplication kernel; otherwise run the general floating-point kernel
+    if(is_interleaved_transposed)
+    {
+        switch(input0->info()->data_type())
+        {
+            case DataType::F16:
+                build_opts.emplace("#define DATA_TYPE_FP16");
+                break;
+
+            case DataType::F32:
+                build_opts.emplace("#define DATA_TYPE_FP32");
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
+
+        // Create kernel
+        _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts);
+
+        // Configure kernel window
+        const unsigned int     num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
+        constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+        AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+        unsigned int num_elems_processed_per_iteration_x;
+        unsigned int num_elems_processed_per_iteration_y;
+
+        switch(input0->info()->data_type())
+        {
+            case DataType::F16:
+                build_opts.emplace("#define DATA_TYPE_FP16");
+
+#define MM_PROCESS_4X_OPTIMIZED
+
+#if defined(MM_PROCESS_4X)
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X");
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                num_elems_processed_per_iteration_x = 4;
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                num_elems_processed_per_iteration_x = 8;
+                num_elems_processed_per_iteration_y = 1;
+                build_opts.emplace("#define MM_PROCESS_8X");
+#endif                                 /* MM_PROCESS_4X */
+                break;
+
+            case DataType::F32:
+                num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type());
+                num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+                build_opts.emplace("#define DATA_TYPE_FP32");
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+        build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y));
+
+        // Create kernel
+        _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts);
+
+        win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+#if defined(MM_PROCESS_4X_OPTIMIZED)
+        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#else  /* MM_PROCESS_4X_OPTIMIZED */
+        AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1),
+                                         num_elems_processed_per_iteration_y));
+#endif /* MM_PROCESS_4X_OPTIMIZED */
+        AccessWindowStatic    input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
+        AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        update_window_and_padding(win, input0_access, input1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->info()->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
+    }
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMMatrixMultiplyKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice          = window.first_slice_window_2D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_input1->info()->num_dimensions() < 3)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        switch(_input0->info()->data_type())
+        {
+            case DataType::F16:
+#if defined(MM_PROCESS_4X)
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_8X)           /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
+#endif                                 /* MM_PROCESS_4X */
+                break;
+
+            case DataType::F32:
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 2), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+                break;
+
+            default:
+                ARM_COMPUTE_ERROR("Current data type is not supported");
+                break;
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
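For reference, the computation this matrix-multiply kernel is built for is out(m, n) = ALPHA * sum_k a(m, k) * b(k, n), where COLS_A is the inner dimension. A minimal CPU sketch with row-major flat buffers (names and layout are illustrative assumptions, not the shader itself):

#include <cstddef>
#include <vector>

void gemm_reference(const std::vector<float> &a, const std::vector<float> &b, std::vector<float> &out,
                    std::size_t m, std::size_t k, std::size_t n, float alpha)
{
    for(std::size_t row = 0; row < m; ++row)
    {
        for(std::size_t col = 0; col < n; ++col)
        {
            float acc = 0.f;
            for(std::size_t i = 0; i < k; ++i)
            {
                acc += a[row * k + i] * b[i * n + col];
            }
            out[row * n + col] = alpha * acc;
        }
    }
}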
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000..c361b60
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t transpose_w = 16 / input->info()->element_size();
+    output_shape.set(0, input->info()->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const int          scale_x                           = num_elems_processed_per_iteration;
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    /*
+     * The following is an example of how the transpose1xW operation works when the input data type is F32
+     *
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+     */
+    // Create kernel
+    build_opts.emplace("#define GEMM_TRANSPOSE1xW");
+    _kernel = GCKernelLibrary::get().create_kernel("gemm_transpose1x4", build_opts);
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowTranspose  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCGEMMTranspose1xWKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    // Output is transposed
+    Window out_window(window);
+    out_window.set(Window::DimX, window.y());
+    out_window.set(Window::DimY, window.x());
+
+    Window in_slice  = window.first_slice_window_2D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, 1, in_slice);
+        add_2D_tensor_argument(idx, _output, 2, out_slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
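The output-shape rule used by GCGEMMTranspose1xWKernel::configure() above can be condensed into a small helper. This is only a sketch of the shape arithmetic (W = 16 / element size, output = [height * W, ceil(width / W)]); the helper name is hypothetical:

#include <cmath>
#include <cstddef>
#include <utility>

std::pair<std::size_t, std::size_t> transpose1xw_output_shape(std::size_t width, std::size_t height, std::size_t element_size)
{
    const std::size_t w          = 16 / element_size; // 4 for F32, 8 for F16
    const std::size_t out_width  = height * w;
    const std::size_t out_height = static_cast<std::size_t>(std::ceil(width / static_cast<float>(w)));
    return { out_width, out_height };
}

For a 4x4 F32 matrix this gives [16, 1], matching the example drawn in the comment of configure().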
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
new file mode 100644
index 0000000..e849891
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCIm2ColKernel::GCIm2ColKernel()
+    : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+{
+}
+
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_UNUSED(kernel_dims);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define " + dt_name);
+
+    if(has_bias)
+    {
+        build_opts.emplace("#define HAS_BIAS");
+    }
+
+    int pad_x    = 0;
+    int pad_y    = 0;
+    int stride_x = 0;
+    int stride_y = 0;
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+                                     && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                    input->info()->tensor_shape().cend(),
+                                                    output->info()->tensor_shape().cbegin() + 1))
+                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+    if(!run_img2col_reduced)
+    {
+        // This path is currently not used and not validated
+        build_opts.insert("#define IM2COL_GENERIC");
+        _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+                                            kernel_dims.first, kernel_dims.second,
+                                            conv_info);
+        _num_elems_processed_per_iteration = output->info()->dimension(0);
+
+        build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.first));
+        build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.second));
+        build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2)));
+        build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first));
+        build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second));
+        build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first));
+        build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second));
+        build_opts.emplace("#define PAD_X " + support::cpp11::to_string(conv_info.pad().first));
+        build_opts.emplace("#define PAD_Y " + support::cpp11::to_string(conv_info.pad().second));
+        build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
+        build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
+
+        // Create kernel
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+        _run_func = &GCIm2ColKernel::run_generic;
+    }
+    else
+    {
+        build_opts.insert("#define IM2COL_REDUCED");
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+        }
+        else if(input->info()->data_type() == DataType::F16)
+        {
+            int input_width  = input->info()->dimension(0);
+            int input_height = input->info()->dimension(1);
+
+            build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
+            if(input_width % 8 == 0)
+            {
+                _num_elems_processed_per_iteration = 8;
+                build_opts.insert("#define IM2COL_REDUCED_8X");
+            }
+            else if(input_width % 4 == 0)
+            {
+                _num_elems_processed_per_iteration = 4;
+                build_opts.insert("#define IM2COL_REDUCED_4X");
+            }
+            else if(input_width % 2 == 0)
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_2X");
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_GENERIC");
+            }
+        }
+
+        // Create kernel
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+
+        _run_func = &GCIm2ColKernel::run_reduced;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+        // Calculate input right border
+        AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+
+        // Calculate output right and bottom border
+        const int          output_width         = output->info()->dimension(0);
+        const int          output_height        = output->info()->dimension(1);
+        const int          output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
+        AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height);
+
+        update_window_and_padding(win, input_access, output_access);
+    }
+
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    if(!run_img2col_reduced)
+    {
+        // Set the Z dimension's step to the size of the whole dimension so that the window cannot be split across the Z dimension
+        win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+    }
+
+    IGCKernel::configure(win);
+}
+
+void GCIm2ColKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
+    (this->*_run_func)(window);
+}
+
+void GCIm2ColKernel::run_generic(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    // Get initial windows
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    // Change the Z dimension's step back to 1
+    window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+    Window slice     = window_collapsed.first_slice_window_3D();
+    Window slice_in  = window_collapsed.first_slice_window_3D();
+    Window slice_out = window_collapsed.first_slice_window_3D();
+
+    // Setup slice
+    slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+    slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+
+    // Setup input slice
+    // The first three dimensions of the input are increased by the inner loops
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output slice
+    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+    slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, slice_in);
+        add_2D_tensor_argument(idx, _output, 2, slice_out);
+
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+        _kernel.set_argument(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
+}
+
+void GCIm2ColKernel::run_reduced(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+
+    Window out_slice = out_window.first_slice_window_1D();
+    Window in_slice  = window.first_slice_window_3D();
+
+    _kernel.use();
+
+    // Run kernel
+    do
+    {
+        // Set arguments
+        unsigned int idx = 0;
+
+        add_3D_tensor_argument(idx, _input, 1, in_slice);
+        add_1D_tensor_argument(idx, _output, 2, out_slice);
+        _kernel.set_argument(idx++, _input->info()->dimension(0));
+        _kernel.set_argument(idx++, _input->info()->dimension(1));
+        _kernel.update_shader_params();
+
+        enqueue(*this, in_slice);
+    }
+    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
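The "reduced" im2col path is selected when the output width already equals W * H * C of the input and the convolution has unit stride and no padding; in that case im2col degenerates into a flatten of each input volume. A minimal CPU sketch of that degenerate case over an assumed flat float buffer (appending a trailing 1 for the bias term is an assumption based on the HAS_BIAS build option):

#include <vector>

std::vector<float> im2col_reduced_reference(const std::vector<float> &input_whc, bool has_bias)
{
    std::vector<float> out(input_whc.begin(), input_whc.end()); // plain copy of the W*H*C volume
    if(has_bias)
    {
        out.push_back(1.f); // bias multiplier appended at the end
    }
    return out;
}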
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
new file mode 100644
index 0000000..5dad767
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+GCNormalizationLayerKernel::GCNormalizationLayerKernel()
+    : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize GCNormalizationLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void GCNormalizationLayerKernel::configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    _input         = input;
+    _squared_input = squared_input;
+    _output        = output;
+
+    const bool         is_in_map    = norm_info.is_in_map();
+    const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
+    _border_size                    = BorderSize(0, border_width);
+
+    // Set kernel static arguments
+    std::string func_name = ((norm_info.type() == NormType::IN_MAP_1D) ? "IN_MAP_1D" : "CROSS_MAP");
+    build_opts.emplace(("#define " + func_name));
+    build_opts.emplace(("#define COEFF " + float_to_string_with_full_precision(norm_info.scale_coeff())));
+    build_opts.emplace(("#define BETA " + float_to_string_with_full_precision(norm_info.beta())));
+    build_opts.emplace(("#define KAPPA " + float_to_string_with_full_precision(norm_info.kappa())));
+    build_opts.emplace(("#define RADIUS " + support::cpp11::to_string(norm_info.norm_size() / 2)));
+    build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+    build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalization_layer", build_opts));
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
+    AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, squared_input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCNormalizationLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _squared_input, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
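The COEFF / BETA / KAPPA / RADIUS build options map onto the usual local response normalization formula. A minimal cross-map CPU sketch for a single spatial position (the input vector is indexed by channel; the helper is illustrative only):

#include <algorithm>
#include <cmath>
#include <vector>

// out[c] = in[c] / pow(kappa + coeff * sum_{j = c-radius .. c+radius} in[j]^2, beta)
std::vector<float> cross_map_norm_reference(const std::vector<float> &in, float coeff, float beta, float kappa, int radius)
{
    const int          channels = static_cast<int>(in.size());
    std::vector<float> out(in.size());
    for(int c = 0; c < channels; ++c)
    {
        float sum_sq = 0.f;
        for(int j = std::max(0, c - radius); j <= std::min(channels - 1, c + radius); ++j)
        {
            sum_sq += in[j] * in[j];
        }
        out[c] = in[c] / std::pow(kappa + coeff * sum_sq, beta);
    }
    return out;
}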
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000..21e967a
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+using namespace arm_compute;
+
+GCPixelWiseMultiplicationKernel::GCPixelWiseMultiplicationKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+        set_format_if_unknown(*output->info(), Format::F32);
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    std::string data_type;
+    std::string compute_type;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    build_opts.emplace("#define SCALE " + support::cpp11::to_string(scale));
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pixelwise_mul_float", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                       input2->info()->valid_region());
+    output_access.set_valid_region(win, valid_region);
+
+    IGCKernel::configure(win);
+}
+
+void GCPixelWiseMultiplicationKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1;
+        add_3D_tensor_argument(idx, _input1, binding++, slice);
+        add_3D_tensor_argument(idx, _input2, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
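The operation configured above is a plain element-wise product scaled by SCALE. A minimal CPU sketch over flat float buffers (names are illustrative assumptions):

#include <algorithm>
#include <cstddef>
#include <vector>

void pixelwise_mul_reference(const std::vector<float> &in1, const std::vector<float> &in2, std::vector<float> &out, float scale)
{
    const std::size_t n = std::min(std::min(in1.size(), in2.size()), out.size());
    for(std::size_t i = 0; i < n; ++i)
    {
        out[i] = in1[i] * in2[i] * scale;
    }
}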
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
new file mode 100644
index 0000000..0b6ba58
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCPoolingLayerKernel::GCPoolingLayerKernel()
+    : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+{
+}
+
+BorderSize GCPoolingLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
+{
+    int                 pool_pad_x        = 0;
+    int                 pool_pad_y        = 0;
+    int                 pool_stride_x     = 0;
+    int                 pool_stride_y     = 0;
+    unsigned int        pooled_w          = 0;
+    unsigned int        pooled_h          = 0;
+    const PoolingType   pool_type         = pool_info.pool_type();
+    int                 pool_size         = pool_info.pool_size();
+    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
+    const bool          is_global_pooling = pool_info.is_global_pooling();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
+    ARM_COMPUTE_ERROR_ON(is_global_pooling && (input->info()->tensor_shape().x() != input->info()->tensor_shape().y()));
+
+    // Update pool size in case of global pooling
+    pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+                                                     input->info()->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pool_info.pad_stride_info());
+
+    // Output auto initialization if not yet initialized
+    {
+        TensorShape output_shape{ input->info()->tensor_shape() };
+        output_shape.set(0, pooled_w);
+        output_shape.set(1, pooled_h);
+
+        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+    const int input_width  = input->info()->dimension(0);
+    const int input_height = input->info()->dimension(1);
+
+    // Set instance variables
+    _input       = input;
+    _output      = output;
+    _pool_info   = pool_info;
+    _border_size = BorderSize(pool_pad_y, pool_pad_x);
+
+    // Set build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    if(input->info()->data_type() == DataType::F32)
+    {
+        build_opts.insert("#define DATA_TYPE_FP32");
+    }
+    else
+    {
+        build_opts.insert("#define DATA_TYPE_FP16");
+    }
+    build_opts.emplace(("#define POOL_" + string_from_pooling_type(pool_type)));
+    build_opts.emplace(("#define STRIDE_X " + support::cpp11::to_string(pool_stride_x)));
+    build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
+    build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
+    build_opts.emplace(("#define STRIDE_Y " + support::cpp11::to_string(pool_stride_y)));
+    build_opts.emplace(("#define PAD_X " + support::cpp11::to_string(pool_pad_x)));
+    build_opts.emplace(("#define PAD_Y " + support::cpp11::to_string(pool_pad_y)));
+
+    // Create kernel
+    if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
+    {
+        // Check if we have a 3x3 pooling with stride_x less than or equal to 3. In that case, run an optimized OpenGL ES kernel where
+        // each thread computes 4 output elements
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+
+        int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            if(is_pool3x3_stride_le3)
+            {
+                // Change the number of elements processed and the number of elements read per iteration for 3x3 pooling with stride less than or equal to 3
+                _num_elems_processed_per_iteration = 4;
+                num_elements_read_per_iteration    = pool_size * (pool_stride_x + 1);
+            }
+        }
+        else
+        {
+            num_elements_read_per_iteration = pool_size;
+            if(is_pool3x3_stride_le3)
+            {
+                _num_elems_processed_per_iteration = 4;
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+            }
+        }
+
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+        std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
+        if(is_pool3x3_stride_le3)
+        {
+            build_opts.insert("#define POOLING_LAYER_3_OPTIMIZED");
+            _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
+        }
+        else
+        {
+            build_opts.insert("#define POOLING_LAYER_" + support::cpp11::to_string(pool_size));
+            _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
+        }
+    }
+    else // Run general case
+    {
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 1;
+        }
+        else
+        {
+            _num_elems_processed_per_iteration = 2;
+        }
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+        _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+        _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+        build_opts.emplace(("#define POOL_SIZE " + support::cpp11::to_string(pool_size)));
+
+        build_opts.insert("#define POOLING_LAYER_N");
+        _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pooling_layer_n", build_opts));
+    }
+
+    Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+
+    if(input->info()->data_type() == DataType::F32)
+    {
+        AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
+        AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+        update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    }
+    else
+    {
+        // Calculate output right and bottom border
+        const int output_width          = output->info()->dimension(0);
+        const int output_height         = output->info()->dimension(1);
+        const int output_padding_right  = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
+        const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
+        const int input_padding_right   = ceil_to_multiple(input_width + 2 * _border_size.right, _num_elems_processed_per_iteration) - (input_width + 2 * _border_size.right);
+        const int input_padding_bottom  = ceil_to_multiple(input_height + 2 * _border_size.bottom, 1) - (input_height + 2 * _border_size.bottom);
+
+        // Configure kernel window
+        AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right + input_padding_right, input_height + _border_size.bottom + input_padding_bottom);
+        AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+        update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    }
+
+    IGCKernel::configure(win);
+}
+
+void GCPoolingLayerKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    _kernel.use();
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    do
+    {
+        // Map the output slice onto the corresponding input region by applying the pooling stride and padding
+        Window in_slice(slice);
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, 1, in_slice);
+        add_3D_tensor_argument(idx, _output, 2, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
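The pooled output size that configure() obtains through scaled_dimensions() follows the standard pooling arithmetic. A small sketch, assuming FLOOR rounding (arm_compute also supports CEIL via DimensionRoundingType) and pool_size <= dim + 2 * pad:

#include <utility>

std::pair<unsigned int, unsigned int> pooled_output_shape(unsigned int w, unsigned int h, unsigned int pool_size,
                                                          unsigned int stride_x, unsigned int stride_y,
                                                          unsigned int pad_x, unsigned int pad_y)
{
    const unsigned int pooled_w = (w + 2 * pad_x - pool_size) / stride_x + 1;
    const unsigned int pooled_h = (h + 2 * pad_y - pool_size) / stride_y + 1;
    return { pooled_w, pooled_h };
}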
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000..040a663
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    // Softmax across the x dimension
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.set(0, 1);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+    _input  = input;
+    _output = output;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_MAX");
+
+    // Tell the kernel that the width is not a multiple of 8
+    if((input->info()->dimension(0) % 8) != 0)
+    {
+        build_opts.insert("#define NON_MULTIPLE_OF_8");
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+    // Set fixed arguments
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    _kernel.set_argument(idx++, input->info()->dimension(0));
+
+    // Configure kernel window
+    // The kernel loops over all elements in steps of 8
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
+    unsigned int       num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
+
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+GCLogits1DShiftExpSumKernel::GCLogits1DShiftExpSumKernel()
+    : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+{
+}
+
+void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM");
+
+    // Tell the kernel that the width is not a multiple of 8
+    if((input->info()->dimension(0) % 8) != 0)
+    {
+        build_opts.insert("#define NON_MULTIPLE_OF_8");
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+    // Set fixed arguments
+    unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the input, max, output and sum tensor parameters
+    _kernel.set_argument(idx++, input->info()->dimension(0));
+
+    // Configure window
+    // The kernel loops over all elements in steps of 8
+    const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
+    unsigned int       num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal max_access(max->info(), 0, num_elems_written_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    IGCKernel::configure(win);
+}
+
+void GCLogits1DShiftExpSumKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        // Bind the input and output tensors
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _max, binding++, slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+        add_3D_tensor_argument(idx, _sum, binding++, slice);
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
+
+GCLogits1DNormKernel::GCLogits1DNormKernel()
+    : _input(nullptr), _sum(nullptr), _output(nullptr)
+{
+}
+
+void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+
+    _input  = input;
+    _sum    = sum;
+    _output = output;
+
+    // Set build options
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.insert("#define " + dt_name);
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.insert("#define SOFTMAX_LAYER_NORM");
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+    // Configure window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    unsigned int           num_elems_written_per_iteration   = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_written_per_iteration = 2;
+    }
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     sum_access(sum->info(), 0, 0, num_elems_written_per_iteration, sum->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, sum_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCLogits1DNormKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    _kernel.use();
+
+    do
+    {
+        Window sum_slice = slice;
+        sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
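+        // The sum tensor holds a single value per row, so its X range is collapsed to one element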
+
+        unsigned int idx     = 0;
+        unsigned int binding = 1; // SSBO binding starts from 1.
+        // Bind the input and output tensors
+        add_3D_tensor_argument(idx, _input, binding++, slice);
+        add_3D_tensor_argument(idx, _sum, binding++, sum_slice);
+        add_3D_tensor_argument(idx, _output, binding++, slice);
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window_collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
new file mode 100644
index 0000000..621c969
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape  output_shape{ input->info()->tensor_shape() };
+    const size_t w_out = input->info()->dimension(1);
+    const size_t h_out = input->info()->dimension(0);
+    output_shape.set(0, w_out);
+    output_shape.set(1, h_out);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+    std::string           dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+    build_opts.emplace(("#define " + dt_name));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 4;
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+#define TRANSPOSE_8X8
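+        // The FP16 block size is fixed at compile time here; TRANSPOSE_8X8 selects the 8x8 shader variants below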
+
+#if defined(TRANSPOSE_4X4)
+        build_opts.emplace(("#define TRANSPOSE_4X4"));
+        num_elems_processed_per_iteration = 4;
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+        if(w_out != h_out)
+        {
+            build_opts.emplace("#define TRANSPOSE_8X8");
+            num_elems_processed_per_iteration = 8;
+        }
+        else
+        {
+            build_opts.emplace("#define TRANSPOSE_8X8_SQUARE");
+            num_elems_processed_per_iteration = 8;
+        }
+#endif                       /* TRANSPOSE_4X4 */
+    }
+
+    // Create kernel
+    _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    IGCKernel::configure(win);
+}
+
+void GCTransposeKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+    _kernel.use();
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        if(_input->info()->data_type() == DataType::F32)
+        {
+            add_2D_tensor_argument(idx, _input, 1, slice);
+            add_2D_tensor_argument(idx, _output, 2, slice);
+        }
+        else if(_input->info()->data_type() == DataType::F16)
+        {
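+            // TRANSPOSE_8X8, defined in configure() above, is still in effect for this translation unit, so only the 8x8 branch below is compiled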
+#if defined(TRANSPOSE_4X4)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#endif                       /* TRANSPOSE_4X4 */
+        }
+
+        _kernel.update_shader_params();
+        enqueue(*this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index fc0b6e9..151d7de 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -106,6 +106,13 @@
         ++n;
     }
 
+    if(tensor_shape.num_dimensions() > 2)
+    {
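+        // Dimension 2 now honours the requested step as well (presumably for kernels that iterate along Z); higher dimensions keep the default step of 1 below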
+        window.set(2, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n]), steps[2]));
+
+        ++n;
+    }
+
     for(; n < Coordinates::num_max_dimensions; ++n)
     {
         window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 4a54675..b65c4f4 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -75,10 +75,11 @@
 {
     ARM_COMPUTE_ERROR_ON(this->buffer() == nullptr);
 
-    const DataType    dt       = this->info()->data_type();
-    const size_t      slices2D = this->info()->tensor_shape().total_size_upper(2);
-    const Strides     strides  = this->info()->strides_in_bytes();
-    const PaddingSize padding  = this->info()->padding();
+    const DataType    dt           = this->info()->data_type();
+    const size_t      slices2D     = this->info()->tensor_shape().total_size_upper(2);
+    const Strides     strides      = this->info()->strides_in_bytes();
+    const PaddingSize padding      = this->info()->padding();
+    const size_t      num_channels = this->info()->num_channels();
 
     // Set precision
     if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
@@ -117,6 +118,8 @@
             break;
     }
 
+    print_width = print_width * num_channels;
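+    // Elements with several channels (e.g. RGB888) print one value per channel, so the print field is widened accordingly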
+
     // Set pointer to start
     const uint8_t *ptr = this->buffer() + start_offset;
 
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index deafabe..dae0800 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -41,7 +41,7 @@
 /* Max S16 value used for saturation purposes. */
 const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 namespace fp16
 {
 inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
@@ -132,7 +132,7 @@
     },
     input, accum);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 namespace
 {
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 67fc45b..9670b77 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -26,8 +26,10 @@
 #include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/QAsymm8.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -39,6 +41,51 @@
 #include <map>
 
 using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
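+    // 16 elements are processed per iteration for every supported data type; a null or empty output means the activation is applied in place on the input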
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    Window                 win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool                   window_changed                    = false;
+
+    if(output != nullptr && (output->total_size() != 0))
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+        window_changed = update_window_and_padding(win,
+                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration),
+                                                   output_access);
+
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        // In-place computation
+        window_changed = update_window_and_padding(win,
+                                                   AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 NEActivationLayerKernel::NEActivationLayerKernel()
     : _input(nullptr), _output(nullptr), _func(nullptr), _act_info(ActivationFunction::LOGISTIC)
@@ -47,7 +94,7 @@
 
 void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     _input    = input;
     _act_info = activation_info;
@@ -56,15 +103,15 @@
     if(output != nullptr)
     {
         // Output auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
+        auto_init_if_empty(*output->info(), *input->info()->clone());
         _output = output;
     }
 
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
+
+    ARM_COMPUTE_ERROR_ON_MSG((input->info()->data_type() == DataType::QASYMM8) && (activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU),
+                             "For QASYMM8 only lower/upper bounded relu is supported");
+
     // Activation functions : FP32
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
     {
@@ -81,7 +128,7 @@
         { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float> },
     };
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     // Activation functions : FP16
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f16 =
     {
@@ -96,7 +143,7 @@
         { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, float16_t> },
         { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, float16_t> },
     };
-#endif /* ARM_COMPUTE_ENABLE_FP16*/
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
 
     // Activation functions : QS8
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
@@ -128,9 +175,17 @@
         { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
         { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
     };
+    // Activation functions : QASYMM8
+    static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
+    {
+        { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qasymm8_t> },
+    };
 
     switch(input->info()->data_type())
     {
+        case DataType::QASYMM8:
+            _func = act_map_qasymm8[activation_info.activation()];
+            break;
         case DataType::QS8:
             _func = act_map_qs8[activation_info.activation()];
             break;
@@ -140,41 +195,22 @@
         case DataType::F32:
             _func = act_map_f32[activation_info.activation()];
             break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = act_map_f16[activation_info.activation()];
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
     }
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-        update_window_and_padding(win,
-                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                                  output_access);
-
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        update_window_and_padding(win,
-                                  AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-    }
-
-    ICPPKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, float16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
@@ -305,7 +341,7 @@
     },
     input, output);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
@@ -523,6 +559,47 @@
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
+typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+{
+    Iterator               input(_input, window);
+    Iterator               output(_output, window);
+    const QuantizationInfo qi_in  = _input->info()->quantization_info();
+    const QuantizationInfo qi_out = _output->info()->quantization_info();
+    const qasymm8x16_t     a      = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.a(), qi_in.scale, qi_in.offset));
+    const qasymm8x16_t     b      = vdupq_n_u8(sqcvt_qasymm8_f32(_act_info.b(), qi_in.scale, qi_in.offset));
+    // Initialise scale/offset for re-quantization
+    float       s  = qi_in.scale / qi_out.scale;
+    float       o  = -qi_in.offset * s + qi_out.offset;
+    float32x4_t vs = vdupq_n_f32(s);
+    float32x4_t vo = vdupq_n_f32(o);
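+    // Re-quantization maps an input-space value q to the output space as q * s + o:
+    //   dequantize: x  = (q - offset_in) * scale_in
+    //   quantize:   q' = x / scale_out + offset_out = q * s + o, with s = scale_in / scale_out
+    // e.g. scale_in = 0.5, offset_in = 10, scale_out = 0.25, offset_out = 0 gives s = 2, o = -20, so q = 12 maps to 4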
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+        const qasymm8x16_t in  = vld1q_u8(input_ptr);
+        qasymm8x16_t       tmp = {};
+
+        switch(F)
+        {
+            case ActivationFunction::LU_BOUNDED_RELU:
+                // Perform activation
+                tmp = vminq_u8(a, vmaxq_u8(b, in));
+                // Re-quantize to new output space
+                tmp = vmlaq_qasymm8(tmp, vs, vo);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Function not implemented");
+                break;
+        }
+
+        vst1q_u8(output_ptr, tmp);
+    },
+    input, output);
+}
+
+template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
 {
     Iterator  input(_input, window);
@@ -653,6 +730,15 @@
     input, output);
 }
 
+Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+
+    return Status{};
+}
+
 void NEActivationLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index f263fd0..8a98cf7 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -145,7 +145,7 @@
     return res;
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 inline float16x8x2_t vadd2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
 {
     const float16x8x2_t res =
@@ -158,11 +158,11 @@
 
     return res;
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 void add_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input1(in1, window);
     Iterator input2(in2, window);
     Iterator output(out, window);
@@ -175,13 +175,13 @@
         vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vadd2q_f16(a, b));
     },
     input1, input2, output);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(in1);
     ARM_COMPUTE_UNUSED(in2);
     ARM_COMPUTE_UNUSED(out);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -355,6 +355,57 @@
     },
     input1, input2, output);
 }
+
+inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+
+    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
+    {
+        // Check that all data types are the same and all fixed-point positions are the same
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16)
+        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
+        && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
+        "You called addition with the wrong image formats");
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
+                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
+                                                    output_access);
+
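+    // Only elements that are valid in both inputs are valid in the output, hence the intersection below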
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
@@ -384,17 +435,7 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
 
     static std::map<std::string, AddFunction *> map_function =
     {
@@ -416,7 +457,6 @@
         { "add_saturate_F32_F32_F32", &add_F32_F32_F32 },
         { "add_wrap_F16_F16_F16", &add_F16_F16_F16 },
         { "add_saturate_F16_F16_F16", &add_F16_F16_F16 },
-
     };
 
     _input1 = input1;
@@ -435,28 +475,19 @@
     {
         _func = it->second;
     }
-    else
-    {
-        ARM_COMPUTE_ERROR("You called arithmetic addition with the wrong tensor data type");
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 85f72c1..3db8028 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -157,7 +157,7 @@
     input1, input2, output);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 inline float16x8x2_t vsub2q_f16(const float16x8x2_t &a, const float16x8x2_t &b)
 {
     const float16x8x2_t res =
@@ -170,11 +170,11 @@
 
     return res;
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input1(in1, window);
     Iterator input2(in2, window);
     Iterator output(out, window);
@@ -187,13 +187,13 @@
         vst2q_f16(reinterpret_cast<float16_t *>(output.ptr()), vsub2q_f16(a, b));
     },
     input1, input2, output);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(in1);
     ARM_COMPUTE_UNUSED(in2);
     ARM_COMPUTE_UNUSED(out);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -348,6 +348,57 @@
     },
     input1, input2, output);
 }
+
+inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+
+    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
+    {
+        // Check that all data types are the same and all fixed-point positions are the same
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16)
+        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
+        && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
+        && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
+        "You called subtract with the wrong image formats");
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
+                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
+                                                    output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
@@ -377,19 +428,9 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
 
-    static std::map<std::string, SubFunction *> map_function =
+    static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
     {
         { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
         { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
@@ -409,7 +450,6 @@
         { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
         { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
         { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
-
     };
 
     _input1 = input1;
@@ -428,28 +468,19 @@
     {
         _func = it->second;
     }
-    else
-    {
-        ARM_COMPUTE_ERROR("You called subtract with the wrong image formats");
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index f6f6f9c..f5144c6 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -33,9 +33,39 @@
 
 using namespace arm_compute;
 
-NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+namespace
 {
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+
+    if(nullptr != output)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
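+    // Process one 128-bit NEON register per iteration: 16 bytes / element size, i.e. 16 elements for QS8, 8 for QS16/F16 and 4 for F32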
+    unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
 }
 
 void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
@@ -169,7 +199,7 @@
     input, output);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
 {
     Iterator input(in, window);
@@ -212,11 +242,29 @@
     },
     input, output);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+} // namespace
+
+NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
+{
+}
 
 void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
+
+    ITensorInfo *output_info = nullptr;
+
+    if(nullptr != output)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), *input->info());
+
+        output_info = output->info();
+    }
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info, mean->info(), var->info(), beta->info(), gamma->info(), epsilon));
 
     _input   = input;
     _output  = input;
@@ -228,59 +276,44 @@
 
     if(output != nullptr)
     {
-        // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
         _output = output;
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
     switch(input->info()->data_type())
     {
         case DataType::QS8:
-            _func                             = &batch_normalization_q8;
-            num_elems_processed_per_iteration = 16;
+            _func = &batch_normalization_q8;
             break;
         case DataType::QS16:
-            _func                             = &batch_normalization_q16;
-            num_elems_processed_per_iteration = 8;
+            _func = &batch_normalization_q16;
             break;
         case DataType::F32:
-            _func                             = &batch_normalization_fp32;
-            num_elems_processed_per_iteration = 4;
+            _func = &batch_normalization_fp32;
             break;
         case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
-            _func                             = &batch_normalization_fp16;
-            num_elems_processed_per_iteration = 8;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            _func = &batch_normalization_fp16;
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
             break;
     }
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        update_window_and_padding(win, input_access);
-    }
-    INEKernel::configure(win);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta,
+                                                 const ITensorInfo *gamma,
+                                                 float              epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr).first);
+
+    return Status{};
 }
 
 void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index d7178e4..0c97005 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -33,7 +33,7 @@
 
 using namespace arm_compute;
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -104,7 +104,7 @@
     },
     input, output);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 BorderSize NEBox3x3Kernel::border_size() const
 {
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index bcbe790..9dfd580 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -51,7 +51,7 @@
 constexpr int MAYBE   = 127;
 } // namespace
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 namespace fp16
 {
 inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
@@ -787,7 +787,7 @@
 
     INEKernel::configure(win);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 namespace
 {
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index a2b24de..4d76dfe 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -122,14 +122,6 @@
             break;
     }
 
-    TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
-    subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
-    TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
-    subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
-
     Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
 
     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -256,14 +248,14 @@
     const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
 
     Window                win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
-    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
-    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[0], 1.f / _y_subsampling[0]);
+    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, y_step, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
     AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
 
     update_window_and_padding(win,
-                              AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
-                              AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
-                              AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
+                              AccessWindowRectangle(plane0->info(), 0, 0, _num_elems_processed_per_iteration, y_step),
+                              AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
+                              AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
                               output_plane0_access,
                               output_plane1_access,
                               output_plane2_access);
@@ -358,7 +350,7 @@
 {
     // Create sub-sampled uv window and init uv planes
     Window win_uv(win);
-    win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
+    win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
     win_uv.validate();
 
     Iterator p0(_planes[0], win);
@@ -405,13 +397,13 @@
 
     // Update UV window
     Window uv_win(win);
-    uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
+    uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
     uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
     uv_win.validate();
 
     // Update output win
     Window out_win(win);
-    out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
+    out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
     out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
     out_win.validate();
 
@@ -421,6 +413,9 @@
     Iterator  p2(_planes[2 - shift], uv_win);
     Iterator  out(_output_multi->plane(1), out_win);
 
+    // Increase the step size after the iterator is created so the stride is calculated correctly for the multi-channel format
+    out_win.set_dimension_step(Window::DimX, out_win.x().step() * 2);
+
     execute_window_loop(out_win, [&](const Coordinates & id)
     {
         const uint8x8x2_t pixels =
@@ -450,19 +445,17 @@
 
     // Update window
     Window tmp_win(win);
-    tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
+    tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
     tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
-    tmp_win.validate();
 
     Iterator in(_planes[plane_id], tmp_win);
     Iterator out(_output_multi->plane(plane_id), tmp_win);
 
     execute_window_loop(tmp_win, [&](const Coordinates & id)
     {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+        const uint8x8_t pixels = vld1_u8(in.ptr());
 
-        vst1_u8(out_ptr, vld1_u8(in_ptr));
+        vst1_u8(out.ptr(), pixels);
     },
     in, out);
 }
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 460d37e..9fda65f 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -36,6 +36,37 @@
 
 using namespace arm_compute;
 
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_dims)
+{
+    TensorShape output_shape = input->tensor_shape();
+    output_shape.set(0, convolved_dims.width);
+    output_shape.set(1, convolved_dims.height);
+    output_shape.set(2, input->tensor_shape()[0]);
+
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::QS16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    // Validate configured output
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
 template <typename T>
 void NECol2ImKernel::run_col2im(const Window &window)
 {
@@ -55,7 +86,7 @@
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const int hidx = id.y();
-        const int idx  = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x;
+        const int idx  = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x;
 
         *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
     },
@@ -67,23 +98,15 @@
 {
 }
 
-void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+void NECol2ImKernel::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, convolved_dims.first);
-    output_shape.set(1, convolved_dims.second);
-    output_shape.set(2, input->info()->tensor_shape()[0]);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), convolved_dims)));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
 
     _input          = input;
     _output         = output;
@@ -116,6 +139,12 @@
     INEKernel::configure(win);
 }
 
+Status NECol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+    return Status{};
+}
+
 void NECol2ImKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
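
The index computation in run_col2im() maps element (channel, hidx) of the column tensor back to a (width, height, channel) position of the convolved output. A scalar sketch of the same mapping, assuming a single image and tightly packed buffers (the kernel itself works on strided ITensor data):

    #include <cstddef>

    // columns: [channels x (conv_w * conv_h)], with channels contiguous (dimension 0)
    // output:  conv_w x conv_h x channels, tightly packed (illustrative layout)
    void col2im_reference(const float *columns, float *output,
                          size_t conv_w, size_t conv_h, size_t channels)
    {
        for(size_t c = 0; c < channels; ++c)                      // id.x() in the kernel
        {
            for(size_t hidx = 0; hidx < conv_w * conv_h; ++hidx)  // id.y() in the kernel
            {
                const size_t w = hidx % conv_w;                   // hidx % _convolved_dims.width
                const size_t h = hidx / conv_w;                   // hidx / _convolved_dims.width
                output[(c * conv_h + h) * conv_w + w] = columns[hidx * channels + c];
            }
        }
    }
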
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index b65f3ba..ca22af0 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -70,6 +70,7 @@
 void NECumulativeDistributionKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
diff --git a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp b/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp
new file mode 100644
index 0000000..71db2e9
--- /dev/null
+++ b/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+NEDeconvolutionLayerUpsampleKernel::NEDeconvolutionLayerUpsampleKernel()
+    : _offsets(nullptr), _input(nullptr), _output(nullptr)
+{
+}
+
+BorderSize NEDeconvolutionLayerUpsampleKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+void NEDeconvolutionLayerUpsampleKernel::configure(const ITensor *input, const ITensor *offsets, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+    }
+
+    _input   = input;
+    _output  = output;
+    _offsets = offsets;
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const int              border_offset                     = border_size().left;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle  input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+    AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, offsets_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDeconvolutionLayerUpsampleKernel::scale_nearest(const Window &window)
+{
+    const size_t input_stride = _input->info()->strides_in_bytes()[1];
+
+    // Compute the ratio between source height and destination height
+    const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
+
+    // Don't increment in X and Y direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Window win_off;
+    win_off.set(Window::DimX, window[Window::DimX]);
+    win_off.set(Window::DimY, window[Window::DimY]);
+
+    for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+    {
+        win_off.set(d, Window::Dimension(0, 0, 0));
+    }
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+    Iterator offsets(_offsets, win_off);
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+        {
+            float32x4x4_t tmp =
+            {
+                {
+                    vdupq_n_f32(0),
+                    vdupq_n_f32(0)
+                }
+            };
+            execute_window_loop(window, [&](const Coordinates & id)
+            {
+                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
+
+                const size_t in_yi      = (id.y() + 0.5f) * hr;
+                const size_t offset_row = in_yi * input_stride;
+
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
+                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
+
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
+                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
+
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
+                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
+
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
+                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
+
+                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
+            },
+            in, offsets, out);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+}
+
+void NEDeconvolutionLayerUpsampleKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    scale_nearest(window);
+}
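
The kernel above reads per-pixel x offsets (in bytes) from the _offsets tensor and combines them with a nearest-neighbour row offset. How those x offsets are produced is outside this kernel; the sketch below is only an assumption of one way they could be precomputed, mirroring the (y + 0.5f) * hr row formula used in scale_nearest():

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper, not part of the library: fill one row of x offsets
    // (in bytes) for nearest-neighbour sampling.
    void fill_nearest_x_offsets(int32_t *offsets, size_t out_width, size_t in_width, size_t element_size)
    {
        const float wr = static_cast<float>(in_width) / static_cast<float>(out_width);
        for(size_t x = 0; x < out_width; ++x)
        {
            // Byte offset from the start of the input row, analogous to
            // offset_row = in_yi * input_stride used for the y direction above.
            const size_t in_xi = static_cast<size_t>((x + 0.5f) * wr);
            offsets[x]         = static_cast<int32_t>(in_xi * element_size);
        }
    }
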
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
similarity index 93%
rename from src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
rename to src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 7a62b0c..01b0f10 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -95,17 +95,17 @@
 }
 } // namespace
 
-NEDepthConcatenateKernel::NEDepthConcatenateKernel()
+NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel()
     : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0)
 {
 }
 
-BorderSize NEDepthConcatenateKernel::border_size() const
+BorderSize NEDepthConcatenateLayerKernel::border_size() const
 {
     return BorderSize(_top_bottom, _left_right);
 }
 
-void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -159,7 +159,7 @@
     INEKernel::configure(win);
 }
 
-void NEDepthConcatenateKernel::run(const Window &window, const ThreadInfo &info)
+void NEDepthConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
similarity index 98%
rename from src/core/NEON/kernels/NEDepthConvertKernel.cpp
rename to src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index d97a20b..c29cb57 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -39,12 +39,12 @@
 class Coordinates;
 } // namespace arm_compute
 
-NEDepthConvertKernel::NEDepthConvertKernel()
+NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
     : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
 {
 }
 
-void NEDepthConvertKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
 
@@ -120,7 +120,7 @@
     ICPPKernel::configure(win);
 }
 
-void NEDepthConvertKernel::run(const Window &window, const ThreadInfo &info)
+void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
new file mode 100644
index 0000000..dd5c448
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
+#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+using namespace arm_compute::detail;
+
+NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
+    : _border_size(0), _input(), _output(), _weights(), _conv_info()
+{
+}
+
+BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+    return _border_size;
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+    std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
+                                                                              weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
+                                                                              conv_info);
+
+    ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
+    ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
+
+    _input                           = input;
+    _output                          = output;
+    _weights                         = weights;
+    _conv_info                       = conv_info;
+    const unsigned int conv_stride_x = conv_info.stride().first;
+    const unsigned int conv_stride_y = conv_info.stride().second;
+    const unsigned int conv_pad_x    = conv_info.pad().first;
+    const unsigned int conv_pad_y    = conv_info.pad().second;
+
+    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
+
+    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+    _border_size                                       = BorderSize(conv_pad_y, conv_pad_x);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration));
+
+    const unsigned int num_x_steps               = (expected_output.first + num_elems_written_per_iteration - 1) / num_elems_written_per_iteration;
+    const int          input_num_elems_processed = get_input_num_elems_processed(num_elems_written_per_iteration, conv_stride_x);
+
+    AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, (num_x_steps - 1) * input_num_elems_processed + 12, conv_stride_y * (expected_output.second - 1) + 2);
+    AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+    AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * num_elems_written_per_iteration, expected_output.second);
+
+    update_window_and_padding(win, input_access, weights_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+template <unsigned int stridex>
+class convolver_3x3
+{
+public:
+    static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
+                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+    {
+        const int          input_stride_x  = input->info()->strides_in_bytes().x();
+        const int          input_stride_y  = input->info()->strides_in_bytes().y();
+        const int          output_stride_y = output->info()->strides_in_bytes().y();
+        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
+        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
+        const int          output_w        = output->info()->dimension(0);
+        const int          output_h        = output->info()->dimension(1);
+        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
+        const unsigned int conv_pad_x      = std::get<0>(conv_info.pad());
+        const unsigned int conv_pad_y      = std::get<1>(conv_info.pad());
+
+        // Set up the output window for the iterator
+        Window window_out = window;
+        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
+        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
+
+        // Set up the input window for the iterator
+        Window window_in = window;
+        // We only want execute_window_loop to iterate over the dimensions > 2, so set the first two dimensions to 0
+        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
+
+        Iterator in(input, window_in);
+        Iterator out(output, window_out);
+        Iterator w(weights, window_k);
+
+        const uint8_t *weights_ptr = w.ptr();
+
+        execute_window_loop(window_out, [&](const Coordinates & id)
+        {
+            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+            int            ih        = 0;
+            int            oh        = 0;
+
+            const uint8_t      *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
+            const auto          ptr_weights_r0   = reinterpret_cast<const float *>(ptr_weights_base);
+            const auto          ptr_weights_r1   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
+            const auto          ptr_weights_r2   = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
+            const float32x4x3_t vw_r0            = load_matrix_row(ptr_weights_r0);
+            const float32x4x3_t vw_r1            = load_matrix_row(ptr_weights_r1);
+            const float32x4x3_t vw_r2            = load_matrix_row(ptr_weights_r2);
+
+            for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
+            {
+                auto in_top = reinterpret_cast<const float *>(input_ptr + (ih + 0) * input_stride_y);
+                auto in_mid = reinterpret_cast<const float *>(input_ptr + (ih + 1) * input_stride_y);
+                auto in_low = reinterpret_cast<const float *>(input_ptr + (ih + 2) * input_stride_y);
+                auto p_out  = reinterpret_cast<float *>(out.ptr() + oh * output_stride_y);
+
+                for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
+                    in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+                {
+                    auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0);
+                    store_results<stridex>(p_out, vres);
+                }
+            }
+        },
+        in, out);
+    }
+};
+
+void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_UNUSED(info);
+
+    const unsigned int conv_stride_x                   = _conv_info.stride().first;
+    const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+
+    switch(conv_stride_x)
+    {
+        case 1:
+            convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        case 2:
+            convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        case 3:
+            convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
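
convolver_3x3 vectorises a per-channel 3x3 convolution. A scalar reference of the same computation, a minimal sketch assuming F32, tightly packed W x H x C buffers and zero padding handled by bounds checks rather than by tensor padding:

    // Illustrative reference only; buffer layouts are assumptions, not the kernel's strided layout.
    void depthwise_conv3x3_reference(const float *in, const float *weights, float *out,
                                     int in_w, int in_h, int channels,
                                     int out_w, int out_h,
                                     int stride_x, int stride_y, int pad_x, int pad_y)
    {
        for(int c = 0; c < channels; ++c)
        {
            for(int oy = 0; oy < out_h; ++oy)
            {
                for(int ox = 0; ox < out_w; ++ox)
                {
                    float acc = 0.f;
                    for(int ky = 0; ky < 3; ++ky)
                    {
                        for(int kx = 0; kx < 3; ++kx)
                        {
                            const int ix = ox * stride_x - pad_x + kx;
                            const int iy = oy * stride_y - pad_y + ky;
                            // Out-of-bounds taps contribute zero (the kernel relies on tensor padding instead)
                            if(ix >= 0 && ix < in_w && iy >= 0 && iy < in_h)
                            {
                                acc += in[(c * in_h + iy) * in_w + ix] * weights[(c * 3 + ky) * 3 + kx];
                            }
                        }
                    }
                    out[(c * out_h + oy) * out_w + ox] = acc;
                }
            }
        }
    }
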
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
new file mode 100644
index 0000000..2ceb39d
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
+    : _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+{
+}
+
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+    _input       = input;
+    _output      = output;
+    _kernel_dims = kernel_dims;
+    _conv_info   = conv_info;
+    _has_bias    = has_bias;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+    //const int kernel_depth   = _input->info()->dimension(2);
+    const int input_w        = _input->info()->dimension(0);
+    const int input_h        = _input->info()->dimension(1);
+    const int input_stride_x = _input->info()->strides_in_bytes().x();
+    const int input_stride_y = _input->info()->strides_in_bytes().y();
+    const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const int stride_x       = _conv_info.stride().first;
+    const int stride_y       = _conv_info.stride().second;
+
+    const int pad_left  = _conv_info.pad_left();
+    const int pad_right = _conv_info.pad_right();
+    const int pad_top   = _conv_info.pad_top();
+
+    Window window_in(window);
+    // Don't advance the input iterator in the first three dimensions; the offsets are computed manually inside the loop
+    window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output window
+    Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->dimension(0)));
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+    window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 1));
+
+    Iterator in(_input, window_in);
+    Iterator out(_output, window_out);
+
+    const int full_length   = input_w + pad_left + pad_right;
+    const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const int src_pixel_linear = id.y() * stride_x;
+
+        const int src_x = -pad_left + src_pixel_linear % max_initial_x;
+        const int src_y = -pad_top + src_pixel_linear / max_initial_x * stride_y;
+
+        // Get pointers
+        const uint8_t *const input_ptr  = in.ptr() + id.z() * input_stride_z;
+        auto                 output_ptr = reinterpret_cast<float *>(out.ptr());
+        const int            height     = src_y + _kernel_dims.height;
+        const int            width      = src_x + _kernel_dims.width;
+
+        for(int y = src_y; y < height; ++y)
+        {
+            for(int x = src_x; x < width; ++x, ++output_ptr)
+            {
+                if(x < 0 || x >= input_w || y < 0 || y >= input_h)
+                {
+                    *output_ptr = 0;
+                }
+                else
+                {
+                    *output_ptr = *(reinterpret_cast<const float *>(input_ptr + x * input_stride_x + y * input_stride_y));
+                }
+            }
+        }
+
+        if(_has_bias)
+        {
+            *output_ptr = static_cast<float>(1);
+        }
+    },
+    in, out);
+}
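
Each output row produced above holds one kernel_w x kernel_h patch (plus an optional bias column) for one output pixel of one channel. A sketch of the per-row extraction, assuming a packed single-channel float plane (illustrative layout; the kernel reads strided buffers and selects the plane via id.z()):

    // Extract one patch starting at (src_x, src_y), which may be negative due to padding.
    void depthwise_im2col_row(const float *in_plane, float *row,
                              int in_w, int in_h,
                              int kernel_w, int kernel_h,
                              int src_x, int src_y, bool has_bias)
    {
        int i = 0;
        for(int y = src_y; y < src_y + kernel_h; ++y)
        {
            for(int x = src_x; x < src_x + kernel_w; ++x, ++i)
            {
                // Out-of-bounds taps correspond to the zero padding requested via conv_info
                row[i] = (x < 0 || x >= in_w || y < 0 || y >= in_h) ? 0.f : in_plane[y * in_w + x];
            }
        }
        if(has_bias)
        {
            row[i] = 1.f; // the bias column is multiplied by the reshaped bias value in the GEMV
        }
    }
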
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
new file mode 100644
index 0000000..9b36df3
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
+    : _input(nullptr), _output(nullptr), _conv_dims()
+{
+}
+
+void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, conv_w);
+    output_shape.set(1, conv_h);
+    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    _input     = input;
+    _output    = output;
+    _conv_dims = std::pair<size_t, size_t>(conv_w, conv_h);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The NEDepthwiseVectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+    // const int input_w         = _input->info()->dimension(0);
+    const int output_stride_x = _output->info()->strides_in_bytes().x();
+    const int output_stride_y = _output->info()->strides_in_bytes().y();
+    const int output_stride_z = _output->info()->strides_in_bytes().z();
+
+    // Setup output window
+    Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, window);
+    Iterator out(_output, window_out);
+
+    const int patch_size = _conv_dims.first * _conv_dims.second;
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int z       = id.x() / patch_size;
+        const int index2D = id.x() - z * patch_size;
+
+        auto input_ptr  = reinterpret_cast<float *>(in.ptr());
+        auto output_ptr = reinterpret_cast<float *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z);
+
+        *output_ptr = *input_ptr;
+    },
+    in, out);
+}
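
The mapping applied per element is the inverse of the flattening done for the GEMV: a flat, channel-major vector index is split into a channel z and a 2D position inside that channel's conv_w x conv_h patch. A packed-buffer sketch of the same arithmetic (layouts assumed for illustration):

    #include <cstddef>

    void depthwise_vector_to_tensor(const float *vec, float *tensor,
                                    size_t conv_w, size_t conv_h, size_t channels)
    {
        const size_t patch_size = conv_w * conv_h;
        for(size_t i = 0; i < channels * patch_size; ++i)   // i plays the role of id.x()
        {
            const size_t z       = i / patch_size;
            const size_t index2D = i - z * patch_size;
            const size_t x       = index2D % conv_w;
            const size_t y       = index2D / conv_w;
            tensor[(z * conv_h + y) * conv_w + x] = vec[i];
        }
    }
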
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..6585fdb
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
+    : _input(nullptr), _output(nullptr), _biases(nullptr)
+{
+}
+
+void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    _input  = input;
+    _output = output;
+    _biases = biases;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The NEDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEDepthwiseWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+    const int input_w         = _input->info()->dimension(0);
+    const int output_stride_x = _output->info()->strides_in_bytes().x();
+    const int output_stride_y = _output->info()->strides_in_bytes().y();
+
+    Window window_in(window);
+    // Iterate over Y (kernel rows) and Z (channels); the X dimension is handled manually in the loop body
+    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+    window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+    window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
+
+    // Setup output window
+    Window window_out;
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, window_in);
+    Iterator out(_output, window_out);
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        auto input_ptr  = reinterpret_cast<float *>(in.ptr());
+        auto output_ptr = reinterpret_cast<float *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y);
+
+        for(int i = 0; i < input_w; ++i, ++input_ptr)
+        {
+            *(output_ptr + i) = *input_ptr;
+        }
+
+        if(_biases != nullptr)
+        {
+            *(output_ptr + input_w) = *(reinterpret_cast<float *>(_biases->ptr_to_element(Coordinates(id.z()))));
+        }
+    },
+    in, out);
+}
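
The net effect of the loop above is that each channel's kernel_w x kernel_h weights end up as one row of the output, with the channel's bias appended as the final column when biases are present. A packed-buffer sketch (illustrative layouts only, not the kernel's strided access):

    #include <cstddef>

    void depthwise_weights_reshape(const float *weights, const float *biases, float *out,
                                   size_t kernel_w, size_t kernel_h, size_t channels)
    {
        const size_t row_len = kernel_w * kernel_h + (biases != nullptr ? 1 : 0);
        for(size_t c = 0; c < channels; ++c)
        {
            // Copy the flattened 2D kernel of channel c into row c
            for(size_t i = 0; i < kernel_w * kernel_h; ++i)
            {
                out[c * row_len + i] = weights[c * kernel_w * kernel_h + i];
            }
            // Append the bias as the last column of the row
            if(biases != nullptr)
            {
                out[c * row_len + kernel_w * kernel_h] = biases[c];
            }
        }
    }
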
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index 6631359..65b7087 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -40,6 +40,62 @@
 
 namespace
 {
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+    if(is_data_type_quantized(input->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(bias, output);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    bool               window_changed                    = false;
+    const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+    if(output != nullptr && (output->total_size() != 0))
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, bias_access);
+        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
 // Internal load
 inline float32x4_t internal_vld1q(const float *in)
 {
@@ -124,7 +180,7 @@
     return vqaddq_qs32(x, y);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 inline float16x8_t internal_vld1q(const float16_t *in)
 {
     return vld1q_f16(in);
@@ -141,7 +197,7 @@
 {
     return vaddq_f16(x, y);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T1, typename T2, bool in_place>
 void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
@@ -186,40 +242,27 @@
 
 void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, bias);
+
+    // Auto-initialize the output if required
     if(output != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output);
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*output->info(), *input->info());
     }
-    ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info()));
 
     _func   = nullptr;
     _bias   = bias;
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
-
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1));
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access, bias_access);
-        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    }
-    else
-    {
-        update_window_and_padding(win, input_access, bias_access);
-        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
-    }
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 
     // Set appropriate function
     switch(input->info()->data_type())
@@ -246,13 +289,13 @@
             _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
             break;
         }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
             _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
             break;
         }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
         {
             _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
@@ -266,6 +309,14 @@
     }
 }
 
+Status NEDirectConvolutionLayerBiasAccumulateKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);
+
+    return Status{};
+}
+
 void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
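
The accumulate_bias() specialisations selected in configure() all implement the same per-channel addition. A scalar F32 equivalent, a minimal sketch assuming a tightly packed W x H x C tensor (pass out == in for the in-place case used when no separate output is given):

    #include <cstddef>

    void accumulate_bias_reference(const float *in, const float *bias, float *out,
                                   size_t w, size_t h, size_t channels)
    {
        for(size_t z = 0; z < channels; ++z)        // bias is indexed by the channel (id.z())
        {
            for(size_t i = 0; i < w * h; ++i)
            {
                out[z * w * h + i] = in[z * w * h + i] + bias[z];
            }
        }
    }
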
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 66d6d1f..2ba0ef2 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
@@ -37,6 +38,7 @@
 #include <arm_neon.h>
 
 using namespace arm_compute;
+using namespace arm_compute::detail;
 
 namespace
 {
@@ -68,7 +70,7 @@
     return vdupq_n_qs16(v);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <unsigned int stridex>
 float16x8_t internal_vld1q(const float16_t *in);
 
@@ -113,7 +115,7 @@
     ARM_COMPUTE_UNUSED(fixed_point_position);
     return vaddq_f16(x, vmulq_f16(y, z));
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <unsigned int stridex>
 float32x4_t internal_vld1q(const float *in);
@@ -249,10 +251,15 @@
     return r;
 }
 
-constexpr int SmallTensorSizeOptim = 8;
+constexpr int small_tensor_size_optim = 8;
+inline bool run_optim_small_tensor_info(const ITensorInfo *t)
+{
+    return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
+}
+
 inline bool run_optim_small_tensor(const ITensor *t)
 {
-    return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim;
+    return run_optim_small_tensor_info(t->info());
 }
 
 // Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
@@ -264,8 +271,8 @@
 public:
     static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
     {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim);
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim);
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim);
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim);
 
         const int          input_stride_y  = input->info()->strides_in_bytes().y();
         const int          input_stride_z  = input->info()->strides_in_bytes().z();
@@ -300,12 +307,12 @@
 
         execute_window_loop(window_out, [&](const Coordinates & id)
         {
-            const uint8_t *input_ptr                    = in.ptr();
-            uint8_t       *out_ptr                      = out.ptr();
-            int            ih                           = 0;
-            int            oh                           = 0;
-            float32x4_t    accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
-            float32x4_t    accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+            const uint8_t *input_ptr                       = in.ptr();
+            uint8_t       *out_ptr                         = out.ptr();
+            int            ih                              = 0;
+            int            oh                              = 0;
+            float32x4_t    accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
+            float32x4_t    accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
             for(int oz = 0; oz < range_z; ++oz)
             {
                 accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
@@ -427,123 +434,7 @@
     }
 };
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
-inline float16x8x3_t load_matrix_row(const float16_t *ptr)
-{
-    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
-       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    const float16x8x3_t r =
-    {
-        {
-            vld1q_dup_f16(ptr),
-            vld1q_dup_f16(1 + ptr),
-            vld1q_dup_f16(2 + ptr)
-        }
-    };
-    return r;
-}
-
-template <unsigned int stridex>
-float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                           int fixed_point_position);
-
-template <>
-float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                              int fixed_point_position)
-{
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
-    const float16x8x3_t vtop =
-    {
-        {
-            vld1q_f16(in_top),
-            vld1q_f16(in_top + 8),
-            vld1q_f16(in_top + 16)
-        }
-    };
-    const float16x8x3_t vmid =
-    {
-        {
-            vld1q_f16(in_mid),
-            vld1q_f16(in_mid + 8),
-            vld1q_f16(in_mid + 16)
-        }
-    };
-    const float16x8x3_t vlow =
-    {
-        {
-            vld1q_f16(in_low),
-            vld1q_f16(in_low + 8),
-            vld1q_f16(in_low + 16)
-        }
-    };
-    float16x8x2_t out =
-    {
-        {
-            vmulq_f16(vtop.val[0], m0.val[0]),
-            vmulq_f16(vtop.val[1], m0.val[0])
-        }
-    };
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
-    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
-    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
-    return out;
-}
-
-template <>
-inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                     int fixed_point_position)
-{
-    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
-    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
-    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
-    return out;
-}
-
-template <>
-inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                     int fixed_point_position)
-{
-    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
-    return out;
-}
-
-template <unsigned int stridex>
-void store_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, values.val[0]);
-    vst1q_f16(buffer + 8, values.val[1]);
-}
-
-template <>
-void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1q_f16(buffer, values.val[0]);
-}
-
-template <>
-void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
-    vst1_f16(buffer, vget_low_f16(values.val[0]));
-}
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 template <unsigned int stridex>
 void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
@@ -567,34 +458,7 @@
     vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
 }
 
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
-
-inline float32x4x3_t load_matrix_row(const float *ptr)
-{
-    const float32x4x3_t r =
-    {
-        {
-            vld1q_dup_f32(ptr),
-            vld1q_dup_f32(1 + ptr),
-            vld1q_dup_f32(2 + ptr)
-        }
-    };
-    return r;
-}
-inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
-{
-    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
-       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    const qint8x8x3_t r =
-    {
-        {
-            vld1_dup_qs8(ptr),
-            vld1_dup_qs8(1 + ptr),
-            vld1_dup_qs8(2 + ptr)
-        }
-    };
-    return r;
-}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <unsigned int stridex>
 float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
@@ -750,213 +614,6 @@
 }
 
 template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
-
-template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
-{
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
-    const float32x4x3_t vtop =
-    {
-        {
-            vld1q_f32(in_top),
-            vld1q_f32(in_top + 4),
-            vld1q_f32(in_top + 8)
-        }
-    };
-    const float32x4x3_t vmid =
-    {
-        {
-            vld1q_f32(in_mid),
-            vld1q_f32(in_mid + 4),
-            vld1q_f32(in_mid + 8)
-        }
-    };
-    const float32x4x3_t vlow =
-    {
-        {
-            vld1q_f32(in_low),
-            vld1q_f32(in_low + 4),
-            vld1q_f32(in_low + 8)
-        }
-    };
-    float32x4x2_t out =
-    {
-        {
-            vmulq_f32(vtop.val[0], m0.val[0]),
-            vmulq_f32(vtop.val[1], m0.val[0])
-        }
-    };
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
-
-    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
-
-    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
-
-    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
-    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
-    return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
-{
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
-    return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
-{
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
-    return out;
-}
-
-template <unsigned int stridex>
-qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);
-
-template <>
-inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
-{
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
-    const qint8x8x3_t vtop =
-    {
-        {
-            vld1_qs8(in_top),
-            vld1_qs8(in_top + 8),
-            vld1_qs8(in_top + 16)
-        }
-    };
-    const qint8x8x3_t vmid =
-    {
-        {
-            vld1_qs8(in_mid),
-            vld1_qs8(in_mid + 8),
-            vld1_qs8(in_mid + 16)
-        }
-    };
-    const qint8x8x3_t vlow =
-    {
-        {
-            vld1_qs8(in_low),
-            vld1_qs8(in_low + 8),
-            vld1_qs8(in_low + 16)
-        }
-    };
-    qint16x8x2_t out =
-    {
-        {
-            vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
-            vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
-        }
-    };
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
-    return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
-{
-    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
-    return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
-{
-    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
-    return out;
-}
-
-template <unsigned int stridex>
-void store_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-void store_results<1>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-    vst1q_f32(buffer + 4, values.val[1]);
-}
-
-template <>
-void store_results<2>(float *buffer, const float32x4x2_t &values)
-{
-    vst1q_f32(buffer, values.val[0]);
-}
-
-template <>
-void store_results<3>(float *buffer, const float32x4x2_t &values)
-{
-    vst1_f32(buffer, vget_low_f32(values.val[0]));
-}
-
-template <unsigned int stridex>
-void store_results(qint16_t *buffer, const qint16x8x2_t &values);
-
-template <>
-void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, values.val[0]);
-    vst1q_qs16(buffer + 8, values.val[1]);
-}
-
-template <>
-void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, values.val[0]);
-}
-
-template <>
-void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1_qs16(buffer, vget_low_s16(values.val[0]));
-}
-
-template <unsigned int stridex>
 void accumulate_results(float *buffer, const float32x4x2_t &values);
 
 template <>
@@ -1000,27 +657,6 @@
     vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
 }
 
-template <unsigned int stridex>
-int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
-
-template <>
-int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration;
-}
-
-template <>
-int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration << 1;
-}
-
-template <>
-int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
-{
-    return num_elems_written_per_iteration * 3;
-}
-
 template <typename T1, typename T2, unsigned int stridex>
 class convolver_3x3
 {
@@ -1361,6 +997,145 @@
     }
 }
 
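+// Compute the convolved output shape: spatial dimensions from scaled_dimensions(), depth from the
+// number of kernels (weights dimension 3)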
+inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info)
+{
+    unsigned int output_width  = 0;
+    unsigned int output_height = 0;
+    std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info);
+
+    TensorShape output_shape = input->tensor_shape();
+    output_shape.set(0, output_width);
+    output_shape.set(1, output_height);
+    output_shape.set(2, weights->dimension(3));
+
+    return output_shape;
+}
+
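+// Static checks on data types, pad/stride limits and weight dimensions; if the output is already
+// configured, its shape and (fixed-point promoted) data type are validated as well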
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
+                                    "Pad > 0 not supported for 1x1 weights");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
+                                    "Pad > 1 not supported for 3x3 weights");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
+                                    "Pad > 2 not supported for 5x5 weights");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info);
+
+        DataType data_type = input->data_type();
+        if(is_data_type_fixed_point(data_type))
+        {
+            // Promote data type in case of fixed point
+            data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+        }
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
+    }
+
+    return Status{};
+}
+
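+// Pick the number of elements read/written per iteration for the given kernel size and data type,
+// derive the required right/bottom border and set up the access windows on input, weights and output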
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
+                                                        unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
+{
+    // Calculate right and bottom border
+    unsigned int       kernel_size   = weights->dimension(0);
+    const unsigned int conv_pad_x    = std::get<0>(conv_info.pad());
+    const unsigned int conv_pad_y    = std::get<1>(conv_info.pad());
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+    const int          input_width   = input->dimension(0);
+    const int          input_height  = input->dimension(1);
+
+    switch(kernel_size)
+    {
+        case 1:
+        {
+            switch(input->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                case DataType::QS8:
+                case DataType::QS16:
+                    num_elems_written_per_iteration = 8;
+                    break;
+                case DataType::F32:
+                    if(run_optim_small_tensor_info(input))
+                    {
+                        num_elems_written_per_iteration = 8;
+                    }
+                    else
+                    {
+                        num_elems_written_per_iteration = 4;
+                    }
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Data type not supported.");
+                    break;
+            }
+            num_weight_elems_read_per_row = kernel_size;
+            num_elems_read_per_iteration  = conv_stride_x * num_elems_written_per_iteration;
+            break;
+        }
+        case 3:
+        case 5:
+        {
+            switch(input->data_type())
+            {
+                case DataType::F32:
+                    num_weight_elems_read_per_row   = 4 + kernel_size - 1;
+                    num_elems_read_per_iteration    = 12;
+                    num_elems_written_per_iteration = 16 >> conv_stride_x;
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                case DataType::QS8:
+                case DataType::QS16:
+                    num_weight_elems_read_per_row   = 8 + kernel_size - 1;
+                    num_elems_read_per_iteration    = 24;
+                    num_elems_written_per_iteration = 32 >> conv_stride_x;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Data type not supported.");
+                    break;
+            }
+        }
+        break;
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not implemented");
+            break;
+        }
+    }
+
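+    // Right/bottom border: how far past the input the reads for the last output element extend (at least kernel_size)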
+    const int upper_bound_w    = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_pad_x - input_width;
+    const int upper_bound_h    = ((output->dimension(1) - 1) * conv_stride_y - conv_pad_y + kernel_size) - input_height;
+    border_size.right          = std::max(upper_bound_w, static_cast<int>(kernel_size));
+    border_size.bottom         = std::max(upper_bound_h, static_cast<int>(kernel_size));
+    Window                 win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+    AccessWindowStatic     input_access(input, -conv_pad_x, -conv_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+    AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
+    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
@@ -1376,23 +1151,9 @@
 
 void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
-                             "Pad > 0 not supported for 1x1 weights");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
-                             "Pad > 1 not supported for 3x3 weights");
-    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
-                             "Pad > 2 not supported for 5x5 weights");
-
-    ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
-    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
-    const unsigned int conv_pad_x    = std::get<0>(conv_info.pad());
-    const unsigned int conv_pad_y    = std::get<1>(conv_info.pad());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+    const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
+    const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
 
     _input       = input;
     _weights     = weights;
@@ -1401,17 +1162,8 @@
     _kernel_size = weights->info()->dimension(0);
     _border_size = BorderSize(conv_pad_y, conv_pad_x);
 
-    const unsigned int kernel_size = weights->info()->dimension(0);
-
     // Get convolved dimensions
-    unsigned int output_width  = 0;
-    unsigned int output_height = 0;
-    std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, output_width);
-    output_shape.set(1, output_height);
-    output_shape.set(2, weights->info()->dimension(3));
+    TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info);
 
     DataType data_type = input->info()->data_type();
 
@@ -1424,88 +1176,34 @@
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type());
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));
 
-    switch(_kernel_size)
-    {
-        case 1:
-        {
-            switch(input->info()->data_type())
-            {
-#ifdef ARM_COMPUTE_ENABLE_FP16
-                case DataType::F16:
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
-                case DataType::QS8:
-                case DataType::QS16:
-                    _num_elems_written_per_iteration = 8;
-                    break;
-                case DataType::F32:
-                    if(run_optim_small_tensor(input))
-                    {
-                        _num_elems_written_per_iteration = 8;
-                    }
-                    else
-                    {
-                        _num_elems_written_per_iteration = 4;
-                    }
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported.");
-                    break;
-            }
-            _num_weight_elems_read_per_row = kernel_size;
-            _num_elems_read_per_iteration  = conv_stride_x * _num_elems_written_per_iteration;
-            break;
-        }
-        case 3:
-        case 5:
-        {
-            switch(input->info()->data_type())
-            {
-                case DataType::F32:
-                    _num_weight_elems_read_per_row   = 4 + _kernel_size - 1;
-                    _num_elems_read_per_iteration    = 12;
-                    _num_elems_written_per_iteration = 16 >> conv_stride_x;
-                    break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
-                case DataType::F16:
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
-                case DataType::QS8:
-                case DataType::QS16:
-                    _num_weight_elems_read_per_row   = 8 + _kernel_size - 1;
-                    _num_elems_read_per_iteration    = 24;
-                    _num_elems_written_per_iteration = 32 >> conv_stride_x;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Data type not supported.");
-                    break;
-            }
-        }
-        break;
-        default:
-        {
-            ARM_COMPUTE_ERROR("Not implemented");
-            break;
-        }
-    }
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,
+                                                    _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    // Calculate right and bottom border
-    const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
-    const int          input_width   = input->info()->dimension(0);
-    const int          input_height  = input->info()->dimension(1);
-    const int          upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
-    const int          upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
-    _border_size.right               = std::max(upper_bound_w, static_cast<int>(_kernel_size));
-    _border_size.bottom              = std::max(upper_bound_h, static_cast<int>(_kernel_size));
-    Window                 win       = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
-    AccessWindowStatic     input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
-    AccessWindowStatic     weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size);
-    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
-    update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
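+// Static validation: run the argument checks and a dry window configuration on cloned tensor infos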
+Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    unsigned int num_weight_elems_read_per_row   = 0;
+    unsigned int num_elems_read_per_iteration    = 0;
+    unsigned int num_elems_written_per_iteration = 0;
+    // BorderSize takes (top/bottom, left/right), so pass pad_y first to match configure()
+    BorderSize   border_size(conv_info.pad().second, conv_info.pad().first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              weights->clone().get(),
+                                                              output->clone().get(),
+                                                              conv_info,
+                                                              num_weight_elems_read_per_row,
+                                                              num_elems_read_per_iteration,
+                                                              num_elems_written_per_iteration,
+                                                              border_size)
+                                .first);
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -1532,11 +1230,11 @@
                 case DataType::F32:
                     convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                     convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                 default:
                     ARM_COMPUTE_ERROR("Data type not supported");
                     break;
@@ -1553,11 +1251,11 @@
                 case DataType::F32:
                     convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                     break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
                     convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                     break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                 default:
                     ARM_COMPUTE_ERROR("Data type not supported");
                     break;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 593a529..af04955 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -47,8 +47,8 @@
     float border_value;
     constant_border_value.get(border_value);
     uint8_t *const start_valid_region = tensor->ptr_to_element(tensor->info()->valid_region().anchor);
-    const size_t &width              = tensor->info()->valid_region().shape[0];
-    const size_t &height             = tensor->info()->valid_region().shape[1];
+    const size_t   width              = tensor->info()->valid_region().shape[0];
+    const size_t   height             = tensor->info()->valid_region().shape[1];
     const int      stridey            = tensor->info()->strides_in_bytes()[1];
 
     // Left and right border
@@ -228,8 +228,8 @@
 void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
 {
     uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
-    const size_t &width              = _tensor->info()->valid_region().shape[0];
-    const size_t &height             = _tensor->info()->valid_region().shape[1];
+    const size_t   width              = _tensor->info()->valid_region().shape[0];
+    const size_t   height             = _tensor->info()->valid_region().shape[1];
 
     // Left and right border
     Window vertical(window);
@@ -287,8 +287,8 @@
     _constant_border_value.get(constant_border_value);
 
     uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
-    const size_t &width              = _tensor->info()->valid_region().shape[0];
-    const size_t &height             = _tensor->info()->valid_region().shape[1];
+    const size_t   width              = _tensor->info()->valid_region().shape[0];
+    const size_t   height             = _tensor->info()->valid_region().shape[1];
     const int      stridey            = _tensor->info()->strides_in_bytes()[1];
 
     // Left and right border
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index ae5d456..2f8afd8 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -40,6 +40,50 @@
 
 namespace
 {
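+// Check the data types and, when the output is configured, that its shape matches the 4x-interleaved input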
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = input->tensor_shape();
+        output_shape.set(0, input->dimension(0) * 4);
+        output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
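+// Interleaving packs blocks of 4 rows into one: the output is 4x wider and a quarter of the height,
+// which the 4.0f / 0.25f scale factors on the output access window reflect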
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    unsigned int           num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    bool                   window_changed                      = false;
+
+    // Configure kernel window
+    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
 void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
 {
     const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -132,9 +176,7 @@
 
 void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     TensorShape output_shape = input->info()->tensor_shape();
     output_shape.set(0, input->info()->dimension(0) * 4);
@@ -143,21 +185,16 @@
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    unsigned int           num_elems_processed_per_iteration_x = 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
     switch(input->info()->element_size())
     {
         case 1:
-            num_elems_processed_per_iteration_x = 8;
-            _func                               = &gemm_interleave_8bit_elements;
+            _func = &gemm_interleave_8bit_elements;
             break;
         case 2:
             _func = &gemm_interleave_16bit_elements;
@@ -171,15 +208,17 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, output_access, input_access);
+Status NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
new file mode 100644
index 0000000..768dd8b
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace
+{
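+// Output shape of the blocked interleave: width grows by a factor of block_height, height shrinks by the same factor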
+TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
+{
+    TensorShape output_shape      = input->tensor_shape();
+    const float interleave_by_f32 = block_height;
+    output_shape.set(0, input->dimension(0) * interleave_by_f32);
+    output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block width must be greater than 0");
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    const unsigned int num_elems_processed_per_iteration_x = block_width;
+    const unsigned int num_elems_processed_per_iteration_y = block_height;
+    bool               window_changed                      = false;
+
+    // Configure kernel window
+    Window      win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    const float scaley_factor = 1.f / block_height;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output,
+                                            0, 0,
+                                            num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
+                                            1, num_elems_processed_per_iteration_y, scaley_factor);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
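+// Blocked interleave of a transposed input: each output block is zero-filled first, then a
+// block_height x block_width tile is read from the input with row/column indices swapped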
+inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
+{
+    const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+    const unsigned int in_height = input->info()->dimension(1);
+    const unsigned int in_width  = input->info()->dimension(0);
+
+    const float scale_y_factor = 1.f / float(block_height);
+
+    // Set window for output tensor
+    Window win_out(window);
+    win_out.scale(Window::DimY, scale_y_factor);
+    Iterator in(input, window);
+
+    win_out.set_dimension_step(Window::DimX, block_width * block_height);
+    Iterator out(output, win_out);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        std::fill_n(out.ptr(), block_width * block_height, 0);
+    },
+    out);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        for(unsigned int z = id.y(); (z < in_width) && z < (id.y() + block_height); ++z)
+        {
+            int j = (z - id.y()) * block_width;
+            for(unsigned int b = id.x(); (b < in_height) && (b < (id.x() + block_width)); ++b)
+            {
+                *(out.ptr() + j++) = *(input->buffer() + b * in_stride + z);
+            }
+        }
+    },
+    in, out);
+}
+
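+// Blocked interleave: zero-fill each output block, then copy a block_height x block_width tile of
+// the input row by row into consecutive output bytes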
+inline void gemm_interleave_blocked_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
+{
+    const size_t in_stride = input->info()->strides_in_bytes()[1];
+
+    const unsigned int in_height = input->info()->dimension(1);
+    const unsigned int in_width  = input->info()->dimension(0);
+
+    const float scale_y_factor = 1.f / float(block_height);
+
+    // Set window for output tensor
+    Window win_out(window);
+    win_out.scale(Window::DimY, scale_y_factor);
+    Iterator in(input, window);
+
+    win_out.set_dimension_step(Window::DimX, block_width * block_height);
+    Iterator out(output, win_out);
+
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        std::fill_n(out.ptr(), block_width * block_height, 0);
+    },
+    out);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        for(unsigned int z = id.y(); (z < in_height) && z < (id.y() + block_height); ++z)
+        {
+            int j = (z - id.y()) * block_width;
+            for(unsigned int b = id.x(); (b < in_width) && (b < (id.x() + block_width)); ++b)
+            {
+                *(out.ptr() + j++) = *(input->buffer() + z * in_stride + b);
+            }
+        }
+    },
+    in, out);
+}
+} // namespace
+
+NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
+    : _block_height(0), _block_width(0), _transpose(false)
+{
+}
+
+void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
+
+    _input        = input;
+    _output       = output;
+    _block_height = block_height;
+    _block_width  = block_width;
+    _transpose    = transpose;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
+{
+    ARM_COMPUTE_UNUSED(transpose);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
+
+    return Status{};
+}
+
+void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    if(_transpose)
+    {
+        gemm_interleave_blocked_transposed_8bit(_input, _output, window, _block_width, _block_height);
+    }
+    else
+    {
+        gemm_interleave_blocked_8bit(_input, _output, window, _block_width, _block_height);
+    }
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index cbba446..9104f0b 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -41,44 +42,788 @@
 
 namespace arm_compute
 {
+namespace
+{
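+// GEMV path: multiplies one row of A (u8) by matrix B (u8) and stores the 16 accumulators per window step as s32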
+void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
+{
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        if(id.x() > width_b)
+        {
+            return;
+        }
+
+        // Note: Since the inputs are all positive, we can use uint32_t
+        // Accumulators for the block 0
+        uint32x4x4_t c0 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
+
+        auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
+        auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
+        auto vec_a_end_addr = vec_a + width_a;
+
+        // This for loop performs 8 accumulations
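+        // Each iteration broadcasts 8 values of vector A against 8 rows of 16 bytes of matrix B,
+        // widening to u16 before the multiply-accumulate into the four u32 accumulators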
+        for(; vec_a <= (vec_a_end_addr - 8);)
+        {
+            const uint8x8_t  a00_u8 = vld1_u8(vec_a);
+            const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+            const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+            const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+            const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+            const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+            const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+            const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+            const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+
+            // Convert a00_u8 to uint16_t and split into lower and upper halves
+            const uint16x4x2_t a00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(a00_u8)),
+                    vget_high_u16(vmovl_u8(a00_u8))
+                }
+            };
+
+            const uint16x4x4_t b00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+                }
+            };
+
+            const uint16x4x4_t b10_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
+                }
+            };
+
+            const uint16x4x4_t b20_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
+                }
+            };
+
+            const uint16x4x4_t b30_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
+                }
+            };
+
+            const uint16x4x4_t b40_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
+                }
+            };
+
+            const uint16x4x4_t b50_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
+                }
+            };
+
+            const uint16x4x4_t b60_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
+                }
+            };
+
+            const uint16x4x4_t b70_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
+                }
+            };
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+            // Accumulate 1:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+            // Accumulate 2:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+            // Accumulate 3:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+            // Accumulate 4:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+            // Accumulate 5:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+            // Accumulate 6:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+            // Accumulate 7:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+            vec_a += 8;
+            matrix_b += 8 * stride_b;
+        }
+
+        // This for loop performs the left-over accumulations
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
+            const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
+
+            const uint16x4x4_t b00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+                }
+            };
+
+            // Convert a00_u8 to uint16_t and get the lower part
+            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+            vec_a += 1;
+            matrix_b += stride_b;
+        }
+
+        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+        vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+        vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+        vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+    },
+    ina, inb, out);
+}
+
+void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, size_t stride_b, const Window &window)
+{
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
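+        // Window x positions that start beyond the width of matrix B carry no work for this thread
+        // (see how the x range is partitioned across threads in run()), so skip them.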
+        if(id.x() > width_b)
+        {
+            return;
+        }
+
+        // Accumulators for the block 0
+        int32x4x4_t c0 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
+
+        auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
+        auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
+        auto vec_a_end_addr = vec_a + width_a;
+
+        // This for loop performs 8 accumulations per iteration
+        for(; vec_a <= (vec_a_end_addr - 8);)
+        {
+            const int8x8_t  a00_s8 = vld1_s8(vec_a);
+            const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+            const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+            const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+            const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+            const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+            const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+            const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+            const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+
+            // Convert a00_s8 to int16_t and get the lower part
+            const int16x4x2_t a00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(a00_s8)),
+                    vget_high_s16(vmovl_s8(a00_s8))
+                }
+            };
+
+            const int16x4x4_t b00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+                }
+            };
+
+            const int16x4x4_t b10_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
+                }
+            };
+
+            const int16x4x4_t b20_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
+                }
+            };
+
+            const int16x4x4_t b30_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
+                }
+            };
+
+            const int16x4x4_t b40_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
+                }
+            };
+
+            const int16x4x4_t b50_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
+                }
+            };
+
+            const int16x4x4_t b60_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
+                }
+            };
+
+            const int16x4x4_t b70_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
+                }
+            };
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+            // Accumulate 1:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+            // Accumulate 2:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+            // Accumulate 3:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+            // Accumulate 4:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+            // Accumulate 5:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+            // Accumulate 6:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+            // Accumulate 7:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+            vec_a += 8;
+            matrix_b += 8 * stride_b;
+        }
+
+        // This for loop performs the left-over accumulations
+        for(; vec_a < vec_a_end_addr;)
+        {
+            const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
+            const int8x16_t b00_s8 = vld1q_s8(matrix_b);
+
+            const int16x4x4_t b00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+                }
+            };
+
+            // Convert a00_s8 to int16_t and get the lower part
+            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+            // Accumulate 0:
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+            vec_a += 1;
+            matrix_b += stride_b;
+        }
+
+        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(vec_out + 0, c0.val[0]);
+        vst1q_s32(vec_out + 4, c0.val[1]);
+        vst1q_s32(vec_out + 8, c0.val[2]);
+        vst1q_s32(vec_out + 12, c0.val[3]);
+    },
+    ina, inb, out);
+}
+
+void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
+{
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const uint8_t *mtx_a0 = ina.ptr();
+        const uint8_t *mtx_b0 = inb.ptr();
+
+        // Note: Since the inputs are all positive, we can use uint32_t
+        // Accumulators for the block 0
+        uint32x4x4_t c0 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
+
+        // Accumulators for the block 1
+        uint32x4x4_t c1 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
+
+        // Accumulators for the block 2
+        uint32x4x4_t c2 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
+
+        // Accumulators for the block 3
+        uint32x4x4_t c3 =
+        {
+            {
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0),
+                vdupq_n_u32(0)
+            }
+        };
+
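+        // Note (illustrative): matrix A is assumed to be interleaved 4x4 and matrix B transposed 1x16,
+        // so each iteration consumes 4 values of A (one per output row) and 16 values of B (one per
+        // output column), accumulating into the four 16-wide rows held in c0..c3.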
+        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+        {
+            const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
+            const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
+
+            // Convert a00_u8 to uint16_t and get the lower part
+            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+            // Convert b00_u8 to uint16_t
+            const uint16x4x4_t b00_u16 =
+            {
+                {
+                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
+                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
+                }
+            };
+
+            // 4x4 block 0
+            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+            // 4x4 block 1
+            c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
+            c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
+            c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
+            c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
+
+            // 4x4 block 2
+            c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
+            c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
+            c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
+            c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
+
+            // 4x4 block 3
+            c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
+            c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
+            c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
+            c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
+        }
+
+        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
+        vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
+        vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
+        vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
+        vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
+        vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
+        vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
+        vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
+        vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
+        vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
+        vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
+        vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
+        vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
+        vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
+        vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
+        vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
+    },
+    ina, inb, out);
+}
+
+void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window)
+{
+    // The implementation assumes that matrix A and matrix B have been reshaped with NEGEMMInterleave4x4 and NEGEMMTranspose1xW respectively
+    // The reshaping of the matrices gives a cache-friendly implementation and avoids the data re-arrangements needed for computing 16x4 elements per iteration
+    // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
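+    // Illustrative scalar equivalent of one iteration of the inner loop below (hypothetical reference code):
+    //   for(int r = 0; r < 4; ++r)
+    //       for(int c = 0; c < 16; ++c)
+    //           acc[r][c] += static_cast<int32_t>(mtx_a0[r]) * static_cast<int32_t>(mtx_b0[c]);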
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
+        auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+
+        // Note: The inputs are signed 8-bit values, so we accumulate into int32_t
+        // Accumulators for the block 0
+        int32x4x4_t c0 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
+
+        // Accumulators for the block 1
+        int32x4x4_t c1 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
+
+        // Accumulators for the block 2
+        int32x4x4_t c2 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
+
+        // Accumulators for the block 3
+        int32x4x4_t c3 =
+        {
+            {
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0),
+                vdupq_n_s32(0)
+            }
+        };
+
+        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+        {
+            const int8x8_t  a00_s8 = vld1_s8(mtx_a0);
+            const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
+
+            // Convert a00_s8 to int16_t and get the lower part
+            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+            // Convert b00_s8 to int16_t
+            const int16x4x4_t b00_s16 =
+            {
+                {
+                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
+                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
+                }
+            };
+
+            // 4x4 block 0
+            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+            // 4x4 block 1
+            c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
+            c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
+            c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
+            c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
+
+            // 4x4 block 2
+            c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
+            c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
+            c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
+            c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
+
+            // 4x4 block 3
+            c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
+            c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
+            c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
+            c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
+        }
+
+        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+        vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
+        vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
+        vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
+        vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
+        vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
+        vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
+        vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
+        vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
+        vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
+        vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
+        vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
+        vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
+        vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+        vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
+        vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
+        vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
+    },
+    ina, inb, out);
+}
+} // namespace
+
 class Coordinates;
 } // namespace arm_compute
 
-NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _output_offset(0), _output_mult_int(0), _shift(0)
+namespace
 {
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    TensorShape in0_shape = input0->tensor_shape();
+    TensorShape in1_shape = input1->tensor_shape();
+    TensorShape out_shape = output->tensor_shape();
+
+    // Check vector-by-matrix case
+    if(out_shape[1] == 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+    }
+    else
+    {
+        in0_shape.collapse(2);
+        in1_shape.collapse(2);
+        out_shape.collapse(2);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
+    }
+
+    return Status{};
 }
 
-void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output,
-                                               int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
-
-    _input0          = input0;
-    _input1          = input1;
-    _output          = output;
-    _a_offset        = a_offset;
-    _b_offset        = b_offset;
-    _output_offset   = output_offset;
-    _output_mult_int = output_mult_int;
-    _shift           = shift;
-
     constexpr unsigned int num_elems_processed_per_iteration_x = 16;
     constexpr unsigned int num_elems_processed_per_iteration_y = 4;
 
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win;
+    bool   window_changed = false;
 
-    AccessWindowRectangle  output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    AccessWindowHorizontal in0_access(input0->info(), 0, num_elems_processed_per_iteration_x);
-    AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x);
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+    if((output->dimension(1) == 1))
+    {
+        // Configure kernel window
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
 
-    update_window_and_padding(win, in0_access, in1_access, output_access);
+        // We cannot read out-of-bounds elements from matrix A, as the left-over for loop handles the tail
+        AccessWindowStatic     in0_access(input0, 0, 0, input0->tensor_shape().x(), 1);
+        AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-    INEKernel::configure(win);
+        window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+    }
+    else
+    {
+        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+        unsigned int num_k_iterations = ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x) / 16;
+        // For each iteration of "k" we increment the input pointer by 4, and we load 8 elements at a time:
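+        // Because of that, the last iteration can read up to 4 elements past the nominal end of a row of
+        // matrix A, which is what the (num_k_iterations - 1) * 4 + 8 static access window accounts for.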
+        AccessWindowStatic     in0_access(input0, 0, 0, (num_k_iterations - 1) * 4 + 8, input0->dimension(1));
+        AccessWindowHorizontal in1_access(input1, 0, input1->dimension(0));
+        AccessWindowRectangle  output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+        window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+
+    TensorShape in1_shape = input1->info()->tensor_shape();
+    in1_shape.collapse(2);
+
+    _input0         = input0;
+    _input1         = input1;
+    _output         = output;
+    _slide_matrix_b = in1_shape[2] != 1;
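+    // When matrix B has a single batch it is not slid along Z (e.g. when the matrix multiplication is
+    // used to perform a convolution and the same reshaped B is reused for every batch of A).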
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+
+    return Status{};
 }
 
 void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
@@ -87,338 +832,106 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
-    const size_t out_stride  = _output->info()->strides_in_bytes()[1];
-
-    /* Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix */
-    Window win_a(window);
-    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1));
-
-    /* Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix */
-    Window win_b(window);
-    win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride));
-    win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    /* The step x and step y for the output matrix has been already set using in configure() */
-    Iterator ina(_input0, win_a);
-    Iterator inb(_input1, win_b);
-    Iterator out(_output, window);
-
-    const int32x4_t voffset_a = vdupq_n_s32(_a_offset);
-    const int32x4_t voffset_b = vdupq_n_s32(_b_offset);
-    const int32x4_t vshiftr   = vdupq_n_s32(-_shift);
-
-    const int width_b = _input1->info()->dimension(0);
-
-    // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
-    // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
-    // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates &)
+    // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
+    if((_output->info()->dimension(1) == 1))
     {
-        const uint8_t *mtx_a0 = ina.ptr();
-        const uint8_t *mtx_b0 = inb.ptr();
+        const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));
+        const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));
+        const auto in_b_stride    = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));
 
-        // Accumulators for the block 0
-        int32x4x4_t c0 =
+        // The implementation computes 16 elements per iteration
+        const int window_start_x = 16 * info.thread_id;
+        const int window_step_x  = 16 * info.num_threads;
+        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
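+        // Illustrative example: with width_matrix_b = 100 and 2 threads, thread 0 covers x = {0, 32, 64, 96}
+        // and thread 1 covers x = {16, 48, 80}; blocks starting past the width are skipped inside the kernels.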
+
+        Window win_out(window);
+        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+        win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Window win_a(window);
+        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Window win_b;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_input1->info()->num_dimensions() >= 3)
         {
-            {
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset)
-            }
-        };
-
-        // Accumulators for the block 1
-        int32x4x4_t c1 =
-        {
-            {
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset)
-            }
-        };
-
-        // Accumulators for the block 2
-        int32x4x4_t c2 =
-        {
-            {
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset)
-            }
-        };
-
-        // Accumulators for the block 3
-        int32x4x4_t c3 =
-        {
-            {
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset),
-                vdupq_n_s32(_output_offset)
-            }
-        };
-
-        int k = 0;
-        // This for loop performs 4 accumulations per iteration
-        for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64)
-        {
-            const uint8x8_t p00  = vld1_u8(mtx_a0 + 0);
-            const uint8x8_t p01  = vld1_u8(mtx_a0 + 8);
-            const uint8x8_t q00l = vld1_u8(mtx_b0 + 0);
-            const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
-            const uint8x8_t q01l = vld1_u8(mtx_b0 + 16);
-            const uint8x8_t q01h = vld1_u8(mtx_b0 + 24);
-            const uint8x8_t q02l = vld1_u8(mtx_b0 + 32);
-            const uint8x8_t q02h = vld1_u8(mtx_b0 + 40);
-            const uint8x8_t q03l = vld1_u8(mtx_b0 + 48);
-            const uint8x8_t q03h = vld1_u8(mtx_b0 + 56);
-
-            const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
-            const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00))));
-            const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01))));
-            const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01))));
-
-            const int32x2x4_t ia0 =
-            {
-                {
-                    vget_low_s32(ia0l),
-                    vget_high_s32(ia0l),
-                    vget_low_s32(ia0h),
-                    vget_high_s32(ia0h)
-                }
-            };
-
-            const int32x2x4_t ia1 =
-            {
-                {
-                    vget_low_s32(ia1l),
-                    vget_high_s32(ia1l),
-                    vget_low_s32(ia1h),
-                    vget_high_s32(ia1h)
-                }
-            };
-
-            const int32x4x4_t ib0 =
-            {
-                {
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
-                }
-            };
-
-            const int32x4x4_t ib1 =
-            {
-                {
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h))))
-                }
-            };
-
-            const int32x4x4_t ib2 =
-            {
-                {
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h))))
-                }
-            };
-
-            const int32x4x4_t ib3 =
-            {
-                {
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h))))
-                }
-            };
-
-            // 4x4 block 0 - Accumulation 0
-            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0);
-            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1);
-            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0);
-            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1);
-            // 4x4 block 0 - Accumulation 1
-            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0);
-            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1);
-            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0);
-            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1);
-            // 4x4 block 0 - Accumulation 2
-            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0);
-            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1);
-            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0);
-            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1);
-            // 4x4 block 0 - Accumulation 3
-            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0);
-            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1);
-            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0);
-            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1);
-
-            // 4x4 block 1 - Accumulation 0
-            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0);
-            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1);
-            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0);
-            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1);
-            // 4x4 block 1 - Accumulation 1
-            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0);
-            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1);
-            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0);
-            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1);
-            // 4x4 block 1 - Accumulation 2
-            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0);
-            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1);
-            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0);
-            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1);
-            // 4x4 block 1 - Accumulation 3
-            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0);
-            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1);
-            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib3.val[1], ia1.val[3], 0);
-            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1);
-
-            // 4x4 block 2 - Accumulation 0
-            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0);
-            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1);
-            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0);
-            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1);
-            // 4x4 block 2 - Accumulation 1
-            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0);
-            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1);
-            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0);
-            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1);
-            // 4x4 block 2 - Accumulation 2
-            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0);
-            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1);
-            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0);
-            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1);
-            // 4x4 block 2 - Accumulation 3
-            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0);
-            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1);
-            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0);
-            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1);
-
-            // 4x4 block 3 - Accumulation 0
-            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0);
-            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1);
-            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0);
-            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1);
-            // 4x4 block 3 - Accumulation 1
-            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0);
-            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1);
-            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0);
-            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1);
-            // 4x4 block 3 - Accumulation 2
-            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0);
-            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1);
-            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0);
-            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1);
-            // 4x4 block 3 - Accumulation 3
-            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0);
-            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1);
-            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0);
-            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1);
+            win_b = window;
         }
+        win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+        win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-        // This for loop handles the left-over accumulations
-        for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
+        Iterator ina(_input0, win_a);
+        Iterator inb(_input1, win_b);
+        Iterator out(_output, win_out);
+
+        switch(_input0->info()->data_type())
         {
-            const uint8x8_t p00  = vld1_u8(mtx_a0);
-            const uint8x8_t q00l = vld1_u8(mtx_b0);
-            const uint8x8_t q00h = vld1_u8(mtx_b0 + 8);
-
-            const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00))));
-
-            const int32x2x2_t ia =
+            case DataType::S8:
             {
-                {
-                    vget_low_s32(ia0),
-                    vget_high_s32(ia0)
-                }
-            };
-
-            const int32x4x4_t ib0 =
-            {
-                {
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))),
-                    vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h))))
-                }
-            };
-
-            // 4x4 block 0
-            c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0);
-            c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1);
-            c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0);
-            c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1);
-
-            // 4x4 block 1
-            c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0);
-            c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1);
-            c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0);
-            c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1);
-
-            // 4x4 block 2
-            c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0);
-            c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1);
-            c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0);
-            c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1);
-
-            // 4x4 block 3
-            c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0);
-            c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1);
-            c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0);
-            c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1);
-        }
-
-        c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr);
-        c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr);
-        c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr);
-        c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr);
-
-        c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr);
-        c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr);
-        c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr);
-        c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr);
-
-        c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr);
-        c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr);
-        c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr);
-        c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr);
-
-        c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr);
-        c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr);
-        c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr);
-        c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr);
-
-        const uint8x16x4_t r =
-        {
-            {
-                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))),
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))),
-                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))),
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))),
-                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))),
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))),
-                vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))),
-                vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3]))))
+                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+                break;
             }
-        };
+            case DataType::U8:
+            case DataType::QASYMM8:
+            {
+                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+            }
+        }
+    }
+    else
+    {
+        const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];
+        const size_t out_stride  = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
 
-        uint8_t *const mtx_out = out.ptr();
-        vst1q_u8(mtx_out + 0 * out_stride, r.val[0]);
-        vst1q_u8(mtx_out + 1 * out_stride, r.val[1]);
-        vst1q_u8(mtx_out + 2 * out_stride, r.val[2]);
-        vst1q_u8(mtx_out + 3 * out_stride, r.val[3]);
-    },
-    ina, inb, out);
+        // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
+        Window win_a(window);
+        win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));
+
+        // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix
+        Window win_b;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(_slide_matrix_b)
+        {
+            win_b = window;
+        }
+        win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride));
+        win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        // The step x and step y for the output matrix have already been set in configure()
+        Iterator ina(_input0, win_a);
+        Iterator inb(_input1, win_b);
+        Iterator out(_output, window);
+
+        const int width_b = _input1->info()->dimension(0);
+        switch(_input0->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
+                break;
+            }
+            case DataType::U8:
+            case DataType::QASYMM8:
+            {
+                matrix_multiply_u8(ina, inb, out, width_b, out_stride, window);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Not supported");
+                break;
+            }
+        }
+    }
 }
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000..f696400
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+                          int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(2);
+
+            ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
+                                         && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                         "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+                                                        int32_t a_offset, int32_t b_offset)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    bool                   window_changed                    = false;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win,
+                                                                 mm_result_access);
+
+    if(a_offset != 0)
+    {
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_col_access);
+    }
+    if(b_offset != 0)
+    {
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_row_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                  a_offset, b_offset));                                         // NOLINT
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+    _a_offset       = a_offset;
+    _b_offset       = b_offset;
+    _k_offset       = a_offset * b_offset * k;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT
+        vector_sum_col_shape.collapse(1);
+
+        // Check if vector_sum_col should be slid along the batch dimension or not
+        // Don't slide vector_sum_col along the y dimension if it has just 1 batch while vector_sum_row has more than one
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(),
+                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                    a_offset, b_offset);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
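
The _k_offset set above is the constant term of expanding the quantized product: summing (A[y][k] + a_offset) * (B[k][x] + b_offset) over k gives the raw accumulator plus a_offset times the column sums of B, plus b_offset times the row sums of A, plus k * a_offset * b_offset. A scalar reference of what run() therefore adds per element, as a sketch only (plain row-major arrays, not the patch's iterators):

#include <cstdint>

// Reference-only sketch: mm_result is width x height, row-major
void offset_contribution_reference(int32_t *mm_result, const int32_t *vector_sum_col, const int32_t *vector_sum_row,
                                   int width, int height, int32_t a_offset, int32_t b_offset, int32_t k)
{
    const int32_t k_offset = a_offset * b_offset * k;
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            // a_offset pairs with the column sums of B, b_offset with the row sums of A
            mm_result[y * width + x] += a_offset * vector_sum_col[x] + b_offset * vector_sum_row[y] + k_offset;
        }
    }
}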
+
+Status NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+                                                    int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              a_offset, b_offset)
+                                .first); // NOLINT
+
+    return Status{};
+}
+
+void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ);
+
+    if(_a_offset != 0 && _b_offset != 0) // true, true
+    {
+        // Set window for vector_sum_col
+        Window win_vector_sum_col(collapsed_window);
+        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+        if(!_slide_vector_sum_col)
+        {
+            win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+        }
+
+        // Set window for vector_sum_row
+        Window win_vector_sum_row(collapsed_window);
+        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
+        Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
+        Iterator mm_result(_mm_result, window);
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Compute the leftover term due to a_offset.
+            int32x4x4_t a_offset_term_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
+                }
+            };
+
+            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
+            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
+            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
+            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
+
+            // Compute the leftover term due to b_offset.
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr()) + id.y());
+            b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
+
+            // Combine the constant k_offset term with a_offset_term_s32 and b_offset_term_s32
+            int32x4x4_t offset_term_s32 =
+            {
+                {
+                    vdupq_n_s32(_k_offset),
+                    vdupq_n_s32(_k_offset),
+                    vdupq_n_s32(_k_offset),
+                    vdupq_n_s32(_k_offset)
+                }
+            };
+
+            offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
+            offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
+            offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
+            offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_col, vector_sum_row, mm_result);
+    }
+    else if((_a_offset == 0) && (_b_offset != 0)) // false, true
+    {
+        // Set window for vector_sum_row
+        Window win_vector_sum_row(collapsed_window);
+        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+        Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
+        Iterator mm_result(_mm_result, window);
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Compute the leftover term due to b_offset.
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr()) + id.y());
+            b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_row, mm_result);
+    }
+    else if((_a_offset != 0) && (_b_offset == 0)) // true, false
+    {
+        // Set window for vector_sum_col
+        Window win_vector_sum_col(collapsed_window);
+        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+        if(!_slide_vector_sum_col)
+        {
+            win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+        }
+
+        Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
+        Iterator mm_result(_mm_result, window);
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Compute the leftover term due to a_offset.
+            int32x4x4_t a_offset_term_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
+                }
+            };
+
+            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
+            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
+            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
+            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_col, mm_result);
+    }
+    else // false, false
+    {
+        // No offset contribution from matrix A and matrix B
+        return;
+    }
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000..8b3f238
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check the bias if it exists
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    // Note: This kernel processes 16 elements per iteration.
+    // However, since the tail is handled by a scalar left-over for loop, no read or write can go out of bounds.
+    // For this reason num_elems_processed_per_iteration is set to 1
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, output_result_access);
+
+        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+template <bool    is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8,
+                                        uint8x16_t max_u8)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
+    in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
+    in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
+    in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
+
+    // Round to the nearest division by a power-of-two using result_shift
+    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
+    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
+    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
+    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
+
+    // Add the offset terms
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
+
+/* Function used by the left-over for loop to perform the quantization */
+template <bool is_bounded_relu>
+inline uint8_t finalize_quantization(int32x4_t in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8_t min_u8, uint8_t max_u8)
+{
+    const static int32x4_t zero_s32      = vdupq_n_s32(0);
+    const static int32x4_t sat_value_s32 = vdupq_n_s32(255);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_s32 = vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier);
+
+    // Round to the nearest division by a power-of-two using result_shift
+    in_s32 = rounding_divide_by_pow2(in_s32, result_shift);
+
+    // Add the offset terms
+    in_s32 = vaddq_s32(in_s32, result_offset_after_shift_s32);
+
+    // Saturate negative values
+    in_s32 = vmaxq_s32(in_s32, zero_s32);
+    in_s32 = vminq_s32(in_s32, sat_value_s32);
+
+    auto out_u8 = static_cast<uint8_t>(vgetq_lane_s32(in_s32, 0));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = std::max(out_u8, min_u8);
+        out_u8 = std::min(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
+} // namespace
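
A scalar model of the requantization performed by the two finalize_quantization overloads above, kept as a sketch (it follows the gemmlowp-style scalar equivalents of the intrinsics; behaviour at exact rounding ties may differ slightly from the NEON fixup):

#include <algorithm>
#include <cstdint>

inline uint8_t requantize_scalar(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset_after_shift,
                                 uint8_t lo, uint8_t hi, bool bounded_relu)
{
    // Saturating rounding doubling multiply high, the scalar analogue of vqrdmulhq_n_s32
    const int64_t ab      = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
    const int64_t rounded = (ab + (int64_t(1) << 30)) >> 31;
    int32_t       x       = static_cast<int32_t>(std::max<int64_t>(INT32_MIN, std::min<int64_t>(INT32_MAX, rounded)));

    // Rounding divide by a power of two, the scalar analogue of rounding_divide_by_pow2
    if(shift > 0)
    {
        const int32_t mask      = (int32_t(1) << shift) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        x                       = (x >> shift) + ((remainder > threshold) ? 1 : 0);
    }

    // Add the offset after the shift, saturate to [0, 255] and apply the optional bounded ReLU
    x = std::max(0, std::min(255, x + offset_after_shift));
    if(bounded_relu)
    {
        x = std::max<int32_t>(lo, std::min<int32_t>(hi, x));
    }
    return static_cast<uint8_t>(x);
}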
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window)
+{
+    const int32x4_t  result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
+    const uint8x16_t min_u8                        = vdupq_n_u8(static_cast<uint8_t>(_min));
+    const uint8x16_t max_u8                        = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+    ARM_COMPUTE_UNUSED(min_u8);
+    ARM_COMPUTE_UNUSED(max_u8);
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator in(_input, win);
+    Iterator out(_output, win);
+
+    if(_bias != nullptr)
+    {
+        Window win_biases;
+        win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator bias(_bias, win_biases);
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            // Compute 16 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                int32x4x4_t in_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
+                    }
+                };
+
+                const int32x4x4_t bias_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
+                    }
+                };
+
+                // Add the bias to GEMM's result
+                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias.ptr()) + x);
+                int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+                // Add bias
+                in_value += bias_value;
+
+                // Finalize and store the result
+                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),
+                                                                          static_cast<uint8_t>(_max));
+            }
+        },
+        in, bias, out);
+    }
+    else
+    {
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            // Compute 16 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                int32x4x4_t in_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
+                    }
+                };
+
+                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+                // Finalize and store the result
+                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+            }
+        },
+        in, out);
+    }
+}
+
+NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0)
+{
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
+                                                                          int result_offset_after_shift, int min, int max)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto-initialize the output if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
+
+    _input                        = input;
+    _bias                         = bias;
+    _output                       = output;
+    _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+    _result_shift                 = result_shift;
+    _result_offset_after_shift    = result_offset_after_shift;
+    _min                          = min;
+    _max                          = max;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+
+    // Check if we need to clamp the result using min and max
+    const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+    _func                      = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Status{};
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    (this->*_func)(window);
+}
\ No newline at end of file
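
For orientation, a minimal usage sketch of the kernel added above, assuming the v17.12 Tensor / TensorInfo / NEScheduler runtime APIs; the function name and the requantization parameters are made up, and in a real pipeline the kernel sits behind the NEGEMMLowp output-stage functions rather than being driven directly:

#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void quantize_down_example()
{
    Tensor acc; // int32 accumulators coming out of the GEMMLowp matrix multiply
    Tensor dst; // final QASYMM8 output
    acc.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::QASYMM8));
    acc.allocator()->allocate();
    dst.allocator()->allocate();

    // Hypothetical requantization parameters: multiplier and shift encode the rescale,
    // 10 is the output zero point, and [0, 255] leaves the bounded ReLU disabled
    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel kernel;
    kernel.configure(&acc, nullptr, &dst, 1073741824, 4, 10, 0, 255);

    NEScheduler::get().schedule(&kernel, Window::DimY);
}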
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
new file mode 100644
index 0000000..573373f
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check the bias if it exists
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    // Note: This kernel processes 16 elements per iteration.
+    // However, since the tail is handled by a scalar left-over for loop, no read or write can go out of bounds.
+    // For this reason num_elems_processed_per_iteration is set to 1
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, output_result_access);
+
+        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+    // Add the offset terms to GEMM's result
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
+
+    // Multiply by result_mult_int
+    in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
+    in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
+    in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
+    in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
+}
+
+template <bool    is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    // Shift final result (negative value shift right)
+    in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+    in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+    in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+    in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
+} // namespace
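
As a worked example of the scale-and-shift path implemented above, with made-up parameters (not from the patch):

// result_offset = -1000, result_mult_int = 2, result_shift = 10, input accumulator = 130000:
//   ((130000 + (-1000)) * 2) >> 10  =  258000 >> 10  =  251   -> stored as uint8_t 251
// Results falling outside [0, 255] (or [min, max] when the bounded ReLU is enabled) are
// clamped by the saturating narrowing conversions and the min/max vectors.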
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window)
+{
+    const int32x4_t  result_offset_s32 = vdupq_n_s32(_result_offset);
+    const int32x4_t  result_shift_s32  = vdupq_n_s32(-_result_shift);
+    const uint8x16_t min_u8            = vdupq_n_u8(static_cast<uint8_t>(_min));
+    const uint8x16_t max_u8            = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+    ARM_COMPUTE_UNUSED(min_u8);
+    ARM_COMPUTE_UNUSED(max_u8);
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator in(_input, win);
+    Iterator out(_output, win);
+
+    if(_bias != nullptr)
+    {
+        Window win_biases;
+        win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
+        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator bias(_bias, win_biases);
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            // Compute 16 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                int32x4x4_t in_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
+                    }
+                };
+
+                const int32x4x4_t bias_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + x + 12)
+                    }
+                };
+
+                // Add the bias to GEMM's result
+                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+                // Add the offset terms to GEMM's result and multiply by result_mult_int
+                scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const int bias_value = *(reinterpret_cast<const int *>(bias.ptr()) + x);
+                int       in_value   = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+                // Quantize
+                in_value = ((in_value + bias_value + _result_offset) * _result_mult_int) >> _result_shift;
+
+                // Finalize and store the result
+                if(is_bounded_relu)
+                {
+                    *(out.ptr() + x) = static_cast<uint8_t>(std::max(_min, std::min(_max, in_value)));
+                }
+                else
+                {
+                    *(out.ptr() + x) = static_cast<uint8_t>(std::max(0, std::min(255, in_value)));
+                }
+            }
+        },
+        in, bias, out);
+    }
+    else
+    {
+        execute_window_loop(win, [&](const Coordinates & id)
+        {
+            // Compute 16 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                int32x4x4_t in_s32 =
+                {
+                    {
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
+                    }
+                };
+
+                // Add the offset terms to GEMM's result and multiply by result_mult_int
+                scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+
+                // Quantize
+                in_value = ((in_value + _result_offset) * _result_mult_int) >> _result_shift;
+
+                // Finalize and store the result
+                if(is_bounded_relu)
+                {
+                    *(out.ptr() + x) = static_cast<uint8_t>(std::max(_min, std::min(_max, in_value)));
+                }
+                else
+                {
+                    *(out.ptr() + x) = static_cast<uint8_t>(std::max(0, std::min(255, in_value)));
+                }
+            }
+        },
+        in, out);
+    }
+}
+
+NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0)
+{
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto-initialize the output if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
+
+    _input           = input;
+    _bias            = bias;
+    _output          = output;
+    _result_offset   = result_offset;
+    _result_mult_int = result_mult_int;
+    _result_shift    = result_shift;
+    _min             = min;
+    _max             = max;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+
+    // Check if we need to clamp the result using min and max
+    const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+    _func                      = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Status{};
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    (this->*_func)(window);
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
new file mode 100644
index 0000000..0aadfc9
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Status{};
+}
+std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
+{
+    const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
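
One point worth noting about both window configurations above: the reduction loops load 16 bytes at a time with vld1q_u8, so the input is declared with a static access window of ceil_to_multiple(dimension(0), 16) elements. A small illustration with a hypothetical width:

// For a QASYMM8 input of width 50, ceil_to_multiple(50, 16) = 64, i.e. the tensor needs at
// least 14 elements of right padding so the 16-byte loads never step outside allocated memory;
// for a non-resizable tensor without that padding the window has to change and the
// "Insufficient Padding!" status above is returned.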
+
+INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
+    : _input(), _output(), _k(0), _is_reshaped(false)
+{
+}
+
+void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
+
+    _input       = mtx_a;
+    _output      = vector_sum_row;
+    _k           = num_mtx_a_cols;
+    _is_reshaped = is_interleaved4x4;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
+{
+    ARM_COMPUTE_UNUSED(num_mtx_a_cols);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first);
+
+    return Status{};
+}
+
+void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+    Window win_input(collapsed_window);
+    win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input, win_input);
+    Iterator out(_output, collapsed_window);
+
+    if(_is_reshaped)
+    {
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
+        {
+            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+            uint32x4_t sum_row = vdupq_n_u32(0);
+
+            const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+#endif /* __arm__ */
+
+            int i = 0;
+            // This for loop performs 4 accumulations
+            for(; i <= (_k - 4); i += 4)
+            {
+                const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
+
+                // Convert U8 to U16
+                uint16x4x4_t a0_u16 =
+                {
+                    {
+                        vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
+                        vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
+                        vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
+                        vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
+                    }
+                };
+
+                // Accumulate to U16
+                a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
+                a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
+                a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
+
+                // Accumulate to U32
+                sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
+            }
+
+            // This for loop performs the leftover accumulations
+            for(; i < _k; ++i)
+            {
+                const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
+
+                // Convert U8 to U16
+                const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
+
+                // Accumulate to U32
+                sum_row = vaddw_u16(sum_row, a0_u16);
+            }
+
+            auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
+
+            vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
+        },
+        in, out);
+    }
+    else // it is not reshaped
+    {
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
+        {
+            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+            uint32x4_t sum_row_u32 = vdupq_n_u32(0);
+            uint32_t   sum_row     = 0;
+
+            const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+#endif /* __arm__ */
+
+            int i = 0;
+            // This for loop performs 16 accumulations
+            for(; i <= (_k - 16); i += 16)
+            {
+                const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
+
+                // Partial accumulations in U16
+                const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
+
+                // Accumulate to U32
+                sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
+            }
+
+            // This for loop performs the leftover accumulations
+            for(; i < _k; ++i)
+            {
+                sum_row += static_cast<uint32_t>(matrix_a[i]);
+            }
+
+#if defined(__aarch64__)
+            // Reduction operation available on 64 bit architectures only
+            sum_row += vaddvq_u32(sum_row_u32);
+#else  // __aarch64__
+            uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
+            tmp            = vpadd_u32(tmp, tmp);
+
+            sum_row += vget_lane_u32(tmp, 0);
+#endif // __aarch64__
+
+            *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
+        },
+        in, out);
+    }
+}
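
Functionally, both branches of run() above compute one row sum of matrix A per output element; a scalar sketch over a plain, non-reshaped row-major matrix (reference only):

#include <cstdint>

// vector_sum_row[y] = sum over k of A[y][k], accumulated in 32 bits as in the kernel
void matrix_a_reduction_reference(const uint8_t *matrix_a, int32_t *vector_sum_row, int num_rows, int num_cols)
{
    for(int y = 0; y < num_rows; ++y)
    {
        uint32_t sum_row = 0;
        for(int k = 0; k < num_cols; ++k)
        {
            sum_row += matrix_a[y * num_cols + k];
        }
        vector_sum_row[y] = static_cast<int32_t>(sum_row);
    }
}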
+
+void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
+
+    _input       = mtx_b;
+    _output      = vector_sum_col;
+    _k           = num_mtx_b_rows;
+    _is_reshaped = is_transposed1xW;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
+{
+    ARM_COMPUTE_UNUSED(num_mtx_b_rows);
+    ARM_COMPUTE_UNUSED(is_transposed1xW);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
+
+    return Status{};
+}
+
+void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+    if(_is_reshaped)
+    {
+        Window win_input(collapsed_window);
+        win_input.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Iterator in(_input, win_input);
+        Iterator out(_output, collapsed_window);
+
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
+        {
+            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+            uint32x4x4_t sum_col =
+            {
+                {
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0)
+                }
+            };
+
+            const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+#endif /* __arm__ */
+
+            int i = 0;
+            for(; i < _k; ++i)
+            {
+                const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
+
+                // Convert U8 to U16
+                const uint16x8x2_t b0_u16 =
+                {
+                    {
+                        vmovl_u8(vget_low_u8(b0_u8)),
+                        vmovl_u8(vget_high_u8(b0_u8))
+                    }
+                };
+
+                // Accumulate to U32
+                sum_col =
+                {
+                    {
+                        vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
+                        vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
+                        vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
+                        vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
+                    }
+                };
+            }
+
+            auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+
+            vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
+            vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
+            vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
+            vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+        },
+        in, out);
+    }
+    else // it is not reshaped
+    {
+        const auto width_matrix_b = static_cast<int>(_input->info()->dimension(0));
+        const auto in_b_stride    = static_cast<int>(_input->info()->strides_in_bytes()[1]);
+
+        // The implementation computes 16 elements per iteration
+        const int window_start_x = 16 * info.thread_id;
+        const int window_step_x  = 16 * info.num_threads;
+        // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
+        const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
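+        // Columns are split across threads in interleaved blocks of 16: thread t processes blocks t, t + num_threads, ...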
+
+        Window win_out(collapsed_window);
+        win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
+
+        Window win_in(win_out);
+        win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Iterator inb(_input, win_in);
+        Iterator out(_output, win_out);
+
+        execute_window_loop(win_out, [&](const Coordinates & id)
+        {
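+            // The window end was rounded up to a multiple of the step, so skip the out-of-range iterations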
+            if(id.x() > width_matrix_b)
+            {
+                return;
+            }
+
+            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+            uint32x4x4_t sum_col =
+            {
+                {
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0),
+                    vdupq_n_u32(0)
+                }
+            };
+
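+            // The iterator already advances along the columns; id.y() selects the matrix in the input's 3rd dimension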
+            const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
+
+#if __arm__
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+#endif /* __arm__ */
+
+            int i = 0;
+            // This for loop accumulates 4 rows of matrix B per iteration
+            for(; i <= (_k - 4); i += 4)
+            {
+                const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
+                const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
+                const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
+                const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
+
+#if __arm__
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+#endif /* __arm__ */
+
+                // Partial accumulation in u16
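+                // Accumulating four u8 rows into u16 lanes cannot overflow (4 * 255 = 1020 < 65535)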
+                uint16x8x2_t tmp_sum =
+                {
+                    {
+                        vdupq_n_u16(0),
+                        vdupq_n_u16(0)
+                    }
+                };
+
+                tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
+                tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
+                tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
+                tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
+                tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
+                tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
+                tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
+                tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
+
+                // Accumulate to U32
+                sum_col =
+                {
+                    {
+                        vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
+                        vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
+                        vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
+                        vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
+                    }
+                };
+
+                matrix_b += 4 * in_b_stride;
+            }
+
+            // This for loop performs the leftover accumulations
+            for(; i < _k; ++i)
+            {
+                const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
+
+                // Convert U8 to U16
+                const uint16x8x2_t b0_u16 =
+                {
+                    {
+                        vmovl_u8(vget_low_u8(b0_u8)),
+                        vmovl_u8(vget_high_u8(b0_u8))
+                    }
+                };
+
+                // Accumulate to U32
+                sum_col =
+                {
+                    {
+                        vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
+                        vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
+                        vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
+                        vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
+                    }
+                };
+
+                matrix_b += in_b_stride;
+            }
+
+            auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+
+            vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
+            vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
+            vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
+            vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+        },
+        inb, out);
+    }
+}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index fb07cb0..3dd59bd 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
@@ -60,7 +61,7 @@
 
     update_window_and_padding(win,
                               AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowStatic(biases->info(), 0, 0, win.x().end(), biases->info()->tensor_shape().y()));
+                              AccessWindowStatic(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->tensor_shape().y()));
 
     AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration);
 
@@ -108,7 +109,7 @@
             in0_out, in1);
             break;
         }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
             execute_window_loop(window, [&](const Coordinates & id)
@@ -128,7 +129,7 @@
             in0_out, in1);
             break;
         }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::QS8:
         {
             execute_window_loop(window, [&](const Coordinates & id)
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 9dbce1d..dfba743 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -66,7 +66,7 @@
     in, out);
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
 {
     const float16x8_t beta_f16 = vdupq_n_f16(beta);
@@ -89,7 +89,7 @@
     },
     in, out);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
 {
@@ -167,10 +167,10 @@
             _func = &matrix_addition_qs16;
             break;
         case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             _func = &matrix_addition_f16;
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Data type not supported");
             break;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 6909082..a583c1d 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -53,7 +53,7 @@
 template <bool multiply_alpha>
 void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     const auto width_matrix_b  = static_cast<int>(output->info()->dimension(0));
     const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
     const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
@@ -186,7 +186,7 @@
 
     },
     ina, inb, out);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(input0);
     ARM_COMPUTE_UNUSED(input1);
     ARM_COMPUTE_UNUSED(output);
@@ -194,7 +194,7 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_ERROR("Not implemented");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 template <bool multiply_alpha>
@@ -915,7 +915,7 @@
 template <bool multiply_alpha>
 void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     const size_t in_b_stride          = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
     const size_t out_stride           = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
     const int    num_elems_matrix_b_x = input1->info()->dimension(0);
@@ -1051,14 +1051,14 @@
         vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
     },
     ina, inb, out);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(input0);
     ARM_COMPUTE_UNUSED(input1);
     ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_ERROR("Not implemented");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 template <bool multiply_alpha>
@@ -1454,13 +1454,13 @@
                 num_elems_processed_per_iteration_x = 16;
                 break;
             }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 num_elems_processed_per_iteration_x = 32;
                 break;
             }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -1503,13 +1503,13 @@
                 num_elems_processed_per_iteration_x = 8;
                 break;
             }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 num_elems_processed_per_iteration_x = 8;
                 break;
             }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -1563,14 +1563,14 @@
                 vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
                 break;
             }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 multiply_alpha ? vector_matrix_multiply_f16<true>(_input0, _input1, _output, window, info, _alpha) :
                 vector_matrix_multiply_f16<false>(_input0, _input1, _output, window, info, _alpha);
                 break;
             }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -1600,14 +1600,14 @@
                 matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
                 break;
             }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 multiply_alpha ? matrix_matrix_multiply_f16<true>(_input0, _input1, _output, window, _alpha) :
                 matrix_matrix_multiply_f16<false>(_input0, _input1, _output, window, _alpha);
                 break;
             }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             default:
             {
                 ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
new file mode 100644
index 0000000..fe79df2
--- /dev/null
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
+    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
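+    // Each plane (Z) of input0 is multiplied by the corresponding row (Y) of input1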
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Configure kernel window
+    const unsigned int num_elems_read_per_iteration = 4;
+
+    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
+
+    AccessWindowHorizontal input0_access(input0->info(), 0, num_elems_read_per_iteration);
+    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+    Window window_slice = window.first_slice_window_3D();
+
+    Window window_in(window);
+    Window window_weights(window_slice);
+    Window window_out(window);
+
+    // Setup input0 slice
+    window_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
+    window_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1), 1));
+    window_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
+
+    // Setup input1 and output slice. Their dimensions are increased in the kernel.
+    window_weights.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_weights.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_weights.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Iterator in(_input0, window_in);
+    Iterator in2(_input1, window_weights);
+    Iterator out(_output, window_out);
+
+    const int input_w          = _input0->info()->dimension(0);
+    const int input_h          = _input0->info()->dimension(1);
+    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
+    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+    const int output_stride_x  = _output->info()->strides_in_bytes().x();
+
+    execute_window_loop(window_in, [&](const Coordinates & id)
+    {
+        // Get pointers
+        const uint8_t *const input_ptr   = in.ptr();
+        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
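+        // Each (row, plane) pair of the input produces one output element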
+        auto                 output_ptr  = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
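+        // Accumulate the dot product between the input row and the selected weights row, 4 floats at a time
+        // (the 4-element access windows configured above are relied on for any trailing padding)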
+        float32x4_t row_dot = vdupq_n_f32(0.f);
+        for(int i = 0; i < input_w; i += 4)
+        {
+            const auto input   = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x));
+            const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x));
+            row_dot            = vaddq_f32(row_dot, vmulq_f32(input, weights));
+        }
+
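+        // Horizontal reduction of the four partial sums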
+        auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot));
+        temp      = vpadd_f32(temp, temp);
+
+        *output_ptr = vget_lane_f32(temp, 0);
+    },
+    in, in2, out);
+}
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 7f4ee1e..a88dc65 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -41,44 +41,87 @@
 
 using namespace arm_compute;
 
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input)
+{
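+    // The 1xW transposition packs blocks of (16 / element_size) elements:
+    // block j of every input row is stored contiguously in output row j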
+    TensorShape  output_shape{ input->tensor_shape() };
+    const size_t transpose_w = 16 / input->element_size();
+    output_shape.set(0, input->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w)))));
+    return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+    const int          scale_x                           = num_elems_processed_per_iteration;
+    bool               window_changed                    = false;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t transpose_w = 16 / input->info()->element_size();
-    output_shape.set(0, input->info()->dimension(1) * transpose_w);
-    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const int          scale_x                           = num_elems_processed_per_iteration;
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
+Status NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index d6cb1b6..7a123e2 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -41,20 +41,19 @@
 using namespace arm_compute;
 
 NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel()
-    : _border_size(0), _l2_load_offset(0)
+    : _l2_load_offset(0)
 {
 }
 
 BorderSize NEGaussianPyramidHorKernel::border_size() const
 {
-    return _border_size;
+    return BorderSize(0, 2);
 }
 
-void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0));
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -62,17 +61,16 @@
         ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
     }
 
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+    _input  = input;
+    _output = output;
 
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 16;
     constexpr unsigned int num_elems_read_per_iteration      = 32;
     constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr float        scale_x                           = 0.5f;
+    const float            scale_x                           = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
 
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
 
     // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
@@ -97,15 +95,12 @@
         _l2_load_offset += 1;
     }
 
+    // Update the window and padding for the input and output accesses
     update_window_and_padding(win,
                               AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
                               output_access);
 
-    ValidRegion valid_region = input->info()->valid_region();
-    valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
-    valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]);
-
-    output_access.set_valid_region(win, valid_region);
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
@@ -163,13 +158,11 @@
     return BorderSize(2, 0);
 }
 
-void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
+void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1));
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
     {
@@ -189,9 +182,9 @@
     constexpr unsigned int num_elems_read_per_iteration = 16;
     constexpr unsigned int num_rows_read_per_iteration  = 5;
 
-    constexpr float scale_y = 0.5f;
+    const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
 
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), border_undefined, border_size());
+    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration, 1.f, scale_y);
 
     // Determine whether we need to load even or odd rows. See above for a
@@ -207,11 +200,7 @@
                               AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_read_per_iteration),
                               output_access);
 
-    ValidRegion valid_region = input->info()->valid_region();
-    valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
-    valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]);
-
-    output_access.set_valid_region(win, valid_region);
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 233b2ba..14fa1b4 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -39,7 +39,7 @@
 
 using namespace arm_compute;
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 template class arm_compute::NEHarrisScoreFP16Kernel<3>;
 template class arm_compute::NEHarrisScoreFP16Kernel<5>;
@@ -361,7 +361,7 @@
     INEKernel::configure(win);
 }
 
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template class arm_compute::NEHarrisScoreKernel<3>;
 template class arm_compute::NEHarrisScoreKernel<5>;
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 6e402ae..02de566 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -87,8 +87,8 @@
         }
     };
 
-    const unsigned int x_start = win.x().start();
-    const unsigned int x_end   = win.x().end();
+    const int x_start = win.x().start();
+    const int x_end   = win.x().end();
 
     // Handle X dimension manually to split into two loops
     // First one will use vector operations, second one processes the left over
@@ -100,7 +100,7 @@
     // Calculate local histogram
     execute_window_loop(win, [&](const Coordinates &)
     {
-        unsigned int x = x_start;
+        int x = x_start;
 
         // Vector loop
         for(; x <= x_end - 8; x += 8)
@@ -136,8 +136,8 @@
 
     std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
 
-    const unsigned int x_start = win.x().start();
-    const unsigned int x_end   = win.x().end();
+    const int x_start = win.x().start();
+    const int x_end   = win.x().end();
 
     // Handle X dimension manually to split into two loops
     // First one will use vector operations, second one processes the left over
@@ -149,7 +149,7 @@
     // Calculate local histogram
     execute_window_loop(win, [&](const Coordinates &)
     {
-        unsigned int x = x_start;
+        int x = x_start;
 
         // Vector loop
         for(; x <= x_end - 8; x += 8)
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 71910e3..8eb235b 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -42,6 +42,18 @@
 
 namespace
 {
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
+    ARM_COMPUTE_UNUSED(kernel_dims);
+    ARM_COMPUTE_UNUSED(conv_info);
+
+    return Status{};
+}
+
 template <typename T, bool has_pads>
 inline void linearize_volume(const uint8_t *const in_ptr,
                              T                   *out_ptr,
@@ -163,16 +175,17 @@
     const int input_stride_y = _input->info()->strides_in_bytes().y();
     const int input_stride_z = _input->info()->strides_in_bytes().z();
 
-    int pad_x    = 0;
-    int pad_y    = 0;
+    int pad_left = 0;
+    int pad_top  = 0;
     int stride_x = 0;
     int stride_y = 0;
-    std::tie(pad_x, pad_y)       = _conv_info.pad();
+    pad_left     = _conv_info.pad_left();
+    pad_top      = _conv_info.pad_top();
     std::tie(stride_x, stride_y) = _conv_info.stride();
 
     // Setup input window
-    const int start_x = -pad_x;
-    const int start_y = -pad_y;
+    const int start_x = -pad_left;
+    const int start_y = -pad_top;
 
     Window window_in(window);
     // The first three dimensions of the input are increased by the inner loops
@@ -277,9 +290,10 @@
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias));
 
     _input          = input;
     _output         = output;
@@ -291,18 +305,15 @@
                                         _conv_info);
     _has_bias = has_bias;
 
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    std::tie(pad_x, pad_y)       = conv_info.pad();
     std::tie(stride_x, stride_y) = conv_info.stride();
 
     bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
                                && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                               input->info()->tensor_shape().cend(),
                                               output->info()->tensor_shape().cbegin() + 1))
-                               && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
 
     Window window = calculate_max_window(*input->info(), Steps());
 
@@ -313,11 +324,11 @@
             case DataType::F32:
                 _func = &NEIm2ColKernel::run_reduced<float>;
                 break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
                 _func = &NEIm2ColKernel::run_reduced<float16_t>;
                 break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QS8:
                 _func = &NEIm2ColKernel::run_reduced<qint8_t>;
                 break;
@@ -334,18 +345,18 @@
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
                 break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
                 break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QS8:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
                 break;
             case DataType::QS16:
-                _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -362,6 +373,12 @@
     IKernel::configure(window);
 }
 
+Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias));
+    return Status{};
+}
+
 void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
similarity index 93%
rename from src/core/NEON/kernels/NEL2NormalizeKernel.cpp
rename to src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 12c532a..3bf1d940 100644
--- a/src/core/NEON/kernels/NEL2NormalizeKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h"
+#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -69,12 +69,12 @@
 }
 } // namespace
 
-NEL2NormalizeKernel::NEL2NormalizeKernel()
+NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
     : _input(nullptr), _sum(nullptr), _output(nullptr), _axis(0), _epsilon(1e-12)
 {
 }
 
-void NEL2NormalizeKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, unsigned int axis, float epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Normalization axis greater than max number of dimensions");
@@ -109,7 +109,7 @@
     INEKernel::configure(win);
 }
 
-void NEL2NormalizeKernel::run(const Window &window, const ThreadInfo &info)
+void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 1b2942c..52e3006 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -51,7 +51,7 @@
 {
 void vector_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     const auto width_matrix_b  = static_cast<int>(output->info()->dimension(0));
     const auto in_b_stride     = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
     const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
@@ -160,14 +160,14 @@
         vst1q_f16(vec_out + 24, acc3);
     },
     ina, out);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(input0);
     ARM_COMPUTE_UNUSED(input1);
     ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR("Not supported, recompile with -march=armv8.2-a+fp16+simd.");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 433985f..2d7c29d 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -51,7 +51,7 @@
 constexpr float COEFF2       = 0.2447f;
 } // namespace
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 namespace fp16
 {
 inline float16x8_t inv(float16x8_t x)
@@ -143,7 +143,7 @@
 
 inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
 {
-    return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2));
+    return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
 }
 
 inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
@@ -429,7 +429,7 @@
 template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
 template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
 template class arm_compute::NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 namespace
 {
@@ -575,11 +575,8 @@
 
 inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
 {
-    int16x8_t gx_abs = vabsq_s16(input1);
-    int16x8_t gy_abs = vabsq_s16(input2);
-
     /* Saturating add */
-    return vqaddq_s16(gx_abs, gy_abs);
+    return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
 }
 
 inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 54ef33e..5bcdc7b 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -61,13 +61,12 @@
     constexpr unsigned int num_elems_read_per_iteration      = 16;
     constexpr unsigned int num_elems_written_per_iteration   = 8;
     constexpr unsigned int num_rows_read_per_iteration       = 3;
-    constexpr int          rect_offset_xy                    = -1;
 
     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
 
     update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration),
+                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
                               output_access);
 
     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index c7dc03c..ad66acd 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/utility.h"
 
 #include <algorithm>
 #include <arm_neon.h>
@@ -319,34 +320,14 @@
     return false;
 }
 
-template <unsigned int...>
-struct index_seq
-{
-    index_seq()                  = default;
-    index_seq(const index_seq &) = default;
-    index_seq &operator=(const index_seq &) = default;
-    index_seq(index_seq &&) noexcept        = default;
-    index_seq &operator=(index_seq &&) noexcept = default;
-    virtual ~index_seq()                        = default;
-};
-template <unsigned int N, unsigned int... S>
-struct gen_index_seq : gen_index_seq < N - 1, N - 1, S... >
-{
-};
-template <unsigned int... S>
-struct gen_index_seq<0u, S...> : index_seq<S...>
-{
-    using type = index_seq<S...>;
-};
-
-template <class T, unsigned int... N>
-struct NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>
+template <class T, std::size_t... N>
+struct NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>
 {
     static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)];
 };
 
-template <class T, unsigned int... N>
-const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, index_seq<N...>>::func_table[sizeof...(N)] =
+template <class T, std::size_t... N>
+const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table[sizeof...(N)] =
 {
     &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
 };
@@ -378,13 +359,13 @@
     switch(input->info()->data_type())
     {
         case DataType::U8:
-            _func = create_func_table<uint8_t, gen_index_seq<16>::type>::func_table[table_idx];
+            _func = create_func_table<uint8_t, utility::index_sequence_t<16>>::func_table[table_idx];
             break;
         case DataType::S16:
-            _func = create_func_table<int16_t, gen_index_seq<16>::type>::func_table[table_idx];
+            _func = create_func_table<int16_t, utility::index_sequence_t<16>>::func_table[table_idx];
             break;
         case DataType::F32:
-            _func = create_func_table<float, gen_index_seq<16>::type>::func_table[table_idx];
+            _func = create_func_table<float, utility::index_sequence_t<16>>::func_table[table_idx];
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type");
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index ba68de6..a6e2b00 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -747,19 +747,20 @@
     Iterator input(_input, win);
     Iterator output(_output, win);
 
-    const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
-    const auto input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+    static const uint8x16_t zero           = vdupq_n_u8(0);
+    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
+    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
     execute_window_loop(win, [&](const Coordinates & id)
     {
-        const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
         const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
 
         uint8x8_t d[] =
         {
@@ -808,19 +809,20 @@
     Iterator input(_input, win);
     Iterator output(_output, win);
 
-    const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
-    const auto input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+    static const uint8x16_t zero           = vdupq_n_u8(0);
+    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
+    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
     execute_window_loop(win, [&](const Coordinates & id)
     {
-        const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
         const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
 
         const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
         uint8x16_t       rows_min_5 = vminq_u8(top_data, bot_data);
@@ -840,19 +842,20 @@
     Iterator input(_input, win);
     Iterator output(_output, win);
 
-    const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -2)));
-    const auto input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 2)));
+    static const uint8x16_t zero           = vdupq_n_u8(0);
+    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
+    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
+    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
+    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
+    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
 
     execute_window_loop(win, [&](const Coordinates & id)
     {
-        const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
+        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
         const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
         const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
         const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
+        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
 
         const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
         uint8x16_t       rows_max_5 = vmaxq_u8(top_data, bot_data);
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index b7dfb59..8f97e6a 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -41,7 +41,7 @@
 class Coordinates;
 } // namespace arm_compute
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 namespace fp16
 {
 inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask)
@@ -224,7 +224,7 @@
 
     INEKernel::configure(win);
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 namespace
 {
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index d6d26e2..776cb27 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -34,6 +34,67 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+
+    if(is_data_type_fixed_point(input->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared);
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
+    }
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *input_squared, ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+    unsigned int       num_elems_processed_per_iteration = 16 / input->element_size();
+    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
+    const unsigned int num_rows                          = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
+    const unsigned int border_width                      = (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+    BorderSize         border_size                       = BorderSize(0, border_width);
+    bool               window_changed                    = false;
+
+    // Configure window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
+    AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, input_squared_access, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, input_squared_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 NENormalizationLayerKernel::NENormalizationLayerKernel()
     : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
 {
@@ -46,22 +107,14 @@
 
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared, output);
-    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
-    }
+    auto_init_if_empty(*output->info(), *input->info());
 
-    const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
+
+    const unsigned int border_width = (norm_info.is_cross_map()) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
 
     _input         = input;
     _input_squared = input_squared;
@@ -69,14 +122,10 @@
     _norm_info     = norm_info;
     _border_size   = BorderSize(0, border_width);
 
-    unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    ARM_COMPUTE_UNUSED(num_elems_processed_per_iteration);
-
     switch(_input->info()->data_type())
     {
         case DataType::F32:
         {
-            num_elems_processed_per_iteration = 4;
             switch(norm_info.type())
             {
                 case NormType::IN_MAP_1D:
@@ -90,14 +139,12 @@
                     _func = &NENormalizationLayerKernel::normalize_float<DataType::F32, 2, false>;
                     break;
                 default:
-                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
         }
         case DataType::F16:
         {
-            num_elems_processed_per_iteration = 8;
             switch(norm_info.type())
             {
                 case NormType::IN_MAP_1D:
@@ -111,14 +158,12 @@
                     _func = &NENormalizationLayerKernel::normalize_float<DataType::F16, 2, false>;
                     break;
                 default:
-                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
         }
         case DataType::QS8:
         {
-            num_elems_processed_per_iteration = 16;
             switch(norm_info.type())
             {
                 case NormType::IN_MAP_1D:
@@ -132,14 +177,12 @@
                     _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
                     break;
                 default:
-                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
         }
         case DataType::QS16:
         {
-            num_elems_processed_per_iteration = 8;
             switch(norm_info.type())
             {
                 case NormType::IN_MAP_1D:
@@ -153,7 +196,6 @@
                     _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
                     break;
                 default:
-                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -162,21 +204,10 @@
             ARM_COMPUTE_ERROR("NOT SUPPORTED!");
     }
 
-    const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-    const unsigned int num_rows                     = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
-
-    // Configure window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowRectangle  input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
-    AccessWindowRectangle  input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, input_squared_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), input_squared->info(), output->info(), norm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 template <DataType dt, unsigned int dim, bool do_2D_norm>
@@ -232,7 +263,7 @@
         },
         input, input_squared, output);
     }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     else if(dt == DataType::F16)
     {
         const float16x8_t coeff_vec    = vdupq_n_f16(_norm_info.scale_coeff());
@@ -268,7 +299,7 @@
         },
         input, input_squared, output);
     }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     else
     {
         ARM_COMPUTE_ERROR("Not supported");
@@ -374,6 +405,14 @@
     }
 }
 
+Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), input_squared->clone().get(), output->clone().get(), norm_info).first);
+
+    return Status{};
+}
+
 void NENormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
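
The new static NENormalizationLayerKernel::validate() runs the same argument checks and window configuration as configure(), but on cloned ITensorInfo objects, so a caller can ask whether a configuration would succeed without allocating or touching tensors. A hedged usage sketch; the header paths and the NormalizationLayerInfo constructor arguments are assumptions about the public API rather than something this patch shows:

// Sketch: query-only validation before configuring the kernel.
#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

bool can_run_norm_layer()
{
    const TensorInfo input(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    const TensorInfo input_squared(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U, 16U, 8U), 1, DataType::F32);

    // norm_size must be odd, as enforced by validate_arguments() above
    const NormalizationLayerInfo norm_info(NormType::IN_MAP_1D, 5);

    const Status status = NENormalizationLayerKernel::validate(&input, &input_squared, &output, norm_info);
    return bool(status); // true if the configuration would be accepted
}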
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 19d45e2..c271032 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -30,7 +30,6 @@
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
 #include <arm_neon.h>
 #include <climits>
@@ -38,9 +37,9 @@
 #include <cstdint>
 #include <cstdlib>
 
-#if ARM_COMPUTE_ENABLE_FP16
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #include <arm_fp16.h> // needed for float16_t
-#endif                /* ARM_COMPUTE_ENABLE_FP16 */
+#endif                /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 using namespace arm_compute;
 
@@ -55,6 +54,68 @@
 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
 const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
 
+inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    ARM_COMPUTE_UNUSED(overflow_policy);
+    ARM_COMPUTE_UNUSED(rounding_policy);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+                                    "Output can only be U8 if both inputs are U8");
+
+    if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
+    {
+        // Check that all data types are the same and all fixed-point positions are the same
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
+        // Check if scale is representable in fixed-point with the provided settings
+        ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
+    }
+
+    if(std::abs(scale - scale255_constant) < 0.00001f)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
+
+        int         exponent            = 0;
+        const float normalized_mantissa = std::frexp(scale, &exponent);
+
+        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+    // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1
+    // Moreover, it will be non-positive for n >= 1 as we deal with 1/2^n
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
+    }
+
+    return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
+                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
+                                                    output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+                                                       input2->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
 /* Scales a given vector by 1/255.
  *
  * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
@@ -335,7 +396,7 @@
 template <bool is_scale255, bool is_sat>
 void mul_F16_F16_F16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     const auto          input1    = static_cast<const float16_t *__restrict>(input1_ptr);
     const auto          input2    = static_cast<const float16_t *__restrict>(input2_ptr);
     const auto          output    = static_cast<float16_t *__restrict>(output_ptr);
@@ -350,13 +411,13 @@
         }
     };
     vst2q_f16(output, result);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(input1_ptr);
     ARM_COMPUTE_UNUSED(input2_ptr);
     ARM_COMPUTE_UNUSED(output_ptr);
     ARM_COMPUTE_UNUSED(scale);
     ARM_COMPUTE_ERROR("Not supported. Recompile the library with arch=arm64-v8.2-a.");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
 template <bool is_scale255, bool is_sat>
@@ -444,6 +505,7 @@
 
 void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
+    ARM_COMPUTE_UNUSED(rounding_policy);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
     // Auto initialize output if not initialized
@@ -469,19 +531,7 @@
         }
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "Output can only be U8 if both inputs are U8");
-    if(is_data_type_fixed_point(input1->info()->data_type()) || is_data_type_fixed_point(input2->info()->data_type()) || is_data_type_fixed_point(output->info()->data_type()))
-    {
-        // Check that all data types are the same and all fixed-point positions are the same
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
-        // Check if scale is representable in fixed-point with the provided settings
-        ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
 
     _input1         = input1;
     _input2         = input2;
@@ -496,32 +546,17 @@
     // Check and validate scaling factor
     if(std::abs(scale - scale255_constant) < 0.00001f)
     {
-        ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
-        ARM_COMPUTE_UNUSED(rounding_policy);
-
         is_scale_255 = true;
     }
     else
     {
-        ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
-        ARM_COMPUTE_UNUSED(rounding_policy);
+        int exponent = 0;
 
-        int         exponent            = 0;
-        const float normalized_mantissa = std::frexp(scale, &exponent);
+        std::frexp(scale, &exponent);
 
-        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
-        // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
-        // Moreover, it will be negative as we deal with 1/2^n
-        if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
-        {
-            // Store the positive exponent. We know that we compute 1/2^n
-            // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
-            _scale_exponent = std::abs(exponent - 1);
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("Scale value not supported (Should be 1/(2^n) or 1/255");
-        }
+        // Store the positive exponent. We know that we compute 1/2^n
+        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+        _scale_exponent = std::abs(exponent - 1);
     }
 
     const DataType dt_input1 = input1->info()->data_type();
@@ -621,23 +656,19 @@
         ARM_COMPUTE_ERROR("You called with the wrong img formats");
     }
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
+                                                 RoundingPolicy rounding_policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
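
configure() can now drop the mantissa/exponent checks because validate_arguments() has already established that scale is either 1/255 or 1/2^n; all that remains is recovering n. std::frexp writes scale as 0.5 * 2^e, so scale = 1/2^n gives e = 1 - n and the stored positive exponent abs(e - 1) equals n. A small standalone check of that arithmetic (standard C++ only, values illustrative):

// Sketch: recovering n from scale = 1/2^n exactly as the kernel does.
#include <cmath>
#include <cstdio>
#include <cstdlib>

int main()
{
    for(int n = 0; n <= 15; ++n)
    {
        const float scale    = 1.0f / static_cast<float>(1 << n);
        int         exponent = 0;
        const float mantissa = std::frexp(scale, &exponent); // scale == mantissa * 2^exponent

        const int scale_exponent = std::abs(exponent - 1);   // what configure() stores

        // mantissa is always 0.5 and exponent runs from 1 (n = 0) down to -14 (n = 15)
        printf("n=%2d scale=%.10f mantissa=%.1f exponent=%3d stored=%2d\n",
               n, scale, mantissa, exponent, scale_exponent);
    }
    return 0;
}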
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 8d4e465..47372c2 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -47,13 +47,28 @@
 
 namespace
 {
+void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
+{
+    TensorShape output_shape{ input->tensor_shape() };
+    output_shape.set(0, pooled_w);
+    output_shape.set(1, pooled_h);
+
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+}
+
+template <bool exclude_padding>
 inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
-    const int start_x = id.x() * stride_x - pad_x;
-    const int start_y = id.y() * stride_y - pad_y;
+    int       start_x = id.x() * stride_x - pad_x;
+    int       start_y = id.y() * stride_y - pad_y;
     const int end_x   = std::min(start_x + pool_size, upper_bound_w);
     const int end_y   = std::min(start_y + pool_size, upper_bound_h);
+    if(exclude_padding)
+    {
+        start_x = std::max(0, start_x);
+        start_y = std::max(0, start_y);
+    }
     return 1.f / ((end_y - start_y) * (end_x - start_x));
 }
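
The new exclude_padding template parameter only changes the divisor used for average pooling: when the window hangs over the top/left padding, start_x and start_y are clamped to 0, and the callers pass upper bounds without the pad so right/bottom padding is not counted either. A scalar sketch of the resulting divisor for a 3x3 pool with pad 1 (plain C++, sizes illustrative):

// Sketch: number of elements the average is divided by, with and without
// exclude_padding, for a 3x3 pool, stride 1, pad 1 on a 6x6 input.
#include <algorithm>
#include <cstdio>

static int pool_divisor(int out_x, int out_y, bool exclude_padding)
{
    const int pool_size = 3, stride = 1, pad = 1, in_dim = 6;
    const int upper_bound = in_dim + (exclude_padding ? 0 : pad); // mirrors upper_bound_w/h above

    int start_x = out_x * stride - pad;
    int start_y = out_y * stride - pad;
    const int end_x = std::min(start_x + pool_size, upper_bound);
    const int end_y = std::min(start_y + pool_size, upper_bound);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return (end_y - start_y) * (end_x - start_x);
}

int main()
{
    // Top-left output: divide by 9 when padding is counted, by 4 when it is excluded.
    printf("include padding: %d\n", pool_divisor(0, 0, false)); // 9
    printf("exclude padding: %d\n", pool_divisor(0, 0, true));  // 4
    return 0;
}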
 
@@ -82,65 +97,77 @@
     const int val     = ((end_y - start_y) * (end_x - start_x));
     return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
 }
-} // namespace
 
-NEPoolingLayerKernel::NEPoolingLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
 {
-}
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-BorderSize NEPoolingLayerKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
-{
-    int                 pool_pad_x      = 0;
-    int                 pool_pad_y      = 0;
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    unsigned int        pooled_w        = 0;
-    unsigned int        pooled_h        = 0;
-    PoolingType         pool_type       = pool_info.pool_type();
-    int                 pool_size       = pool_info.pool_size();
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+    int                 pool_pad_x        = 0;
+    int                 pool_pad_y        = 0;
+    int                 pool_stride_x     = 0;
+    int                 pool_stride_y     = 0;
+    PoolingType         pool_type         = pool_info.pool_type();
+    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
+    const bool          exclude_padding   = pool_info.exclude_padding();
+    const bool          is_global_pooling = pool_info.is_global_pooling();
     std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-
     static const std::set<int> supported_pool_sizes = { 2, 3 };
-    ARM_COMPUTE_UNUSED(supported_pool_sizes);
 
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->info()->data_type()));
-    ARM_COMPUTE_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->info()->data_type() != DataType::F32));
-    ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
-    ARM_COMPUTE_ERROR_ON(is_data_type_fixed_point(input->info()->data_type()) && pool_stride_x > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->data_type() != DataType::F32));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
 
-    // Check output dimensions
-    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
-                                                     pool_size, pool_size, pool_info.pad_stride_info());
-
-    // Output auto initialization if not yet initialized
+    if(output->total_size() != 0)
     {
-        TensorShape output_shape{ input->info()->tensor_shape() };
-        output_shape.set(0, pooled_w);
-        output_shape.set(1, pooled_h);
-
-        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h));
     }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+    return Status{};
+}
 
-    unsigned int num_elems_read_per_iteration      = 0;
-    unsigned int num_elems_processed_per_iteration = 0;
-    unsigned int num_elems_horizontal_window       = 0;
+Status validate_arguments_pool_info(const ITensorInfo *input, const PoolingLayerInfo &pool_info, const unsigned int pool_size)
+{
+    const bool is_global_pooling = pool_info.is_global_pooling();
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
+                                    "Global pooling is supported only with square inputs!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
+                                    "Invalid pool size and pool pad combination!");
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &num_elems_processed_per_iteration,
+                                                        BorderSize &border_size,
+                                                        unsigned int pooled_w, unsigned int pooled_h, int pool_size)
+{
+    unsigned int        num_elems_read_per_iteration = 0;
+    unsigned int        num_elems_horizontal_window  = 0;
+    int                 pool_pad_x                   = 0;
+    int                 pool_pad_y                   = 0;
+    int                 pool_stride_x                = 0;
+    int                 pool_stride_y                = 0;
+    const int           input_width                  = input->dimension(0);
+    const int           input_height                 = input->dimension(1);
+    const PadStrideInfo pad_stride_info              = pool_info.pad_stride_info();
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+                                                     input->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pad_stride_info);
 
     // Select element size
-    switch(input->info()->data_type())
+    switch(input->data_type())
     {
         case DataType::QS8:
             num_elems_read_per_iteration = 16;
@@ -173,7 +200,7 @@
             }
             num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
             break;
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             switch(pool_size)
             {
@@ -192,7 +219,7 @@
                     break;
             }
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         case DataType::F32:
             switch(pool_size)
             {
@@ -217,19 +244,89 @@
             break;
     }
 
-    _num_elems_processed_per_iteration = num_elems_processed_per_iteration;
-    const int input_width              = input->info()->dimension(0);
-    const int input_height             = input->info()->dimension(1);
-    const int upper_bound_w            = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
-    const int upper_bound_h            = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+    const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+    border_size         = BorderSize(pool_pad_y, pool_pad_x);
+    border_size.right   = std::max(upper_bound_w, pool_pad_x);
+    border_size.bottom  = std::max(upper_bound_h, pool_pad_y);
+    bool window_changed = false;
+
+    TensorShape output_shape{ input->tensor_shape() };
+    output_shape.set(0, pooled_w);
+    output_shape.set(1, pooled_h);
+    TensorInfo output_info(input->clone()->set_tensor_shape(output_shape));
+
+    Window             win = calculate_max_window(output_info, Steps(num_elems_processed_per_iteration));
+    AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_horizontal_window);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEPoolingLayerKernel::NEPoolingLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0)
+{
+}
+
+BorderSize NEPoolingLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    int                 pool_pad_x        = 0;
+    int                 pool_pad_y        = 0;
+    int                 pool_stride_x     = 0;
+    int                 pool_stride_y     = 0;
+    unsigned int        pooled_w          = 0;
+    unsigned int        pooled_h          = 0;
+    PoolingType         pool_type         = pool_info.pool_type();
+    int                 pool_size         = pool_info.pool_size();
+    const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
+    const bool          exclude_padding   = pool_info.exclude_padding();
+    const bool          is_global_pooling = pool_info.is_global_pooling();
+    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    // Update pool size in case of global pooling
+    pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
+
+    // Validate pool info before calling scaled_dimensions
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+                                                     input->info()->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pool_info.pad_stride_info());
+
+    // Output auto initialization if not yet initialized
+    auto_init(input->info(), output->info(), pooled_w, pooled_h);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h, pool_size));
 
     // Set instance variables
-    _input              = input;
-    _output             = output;
-    _pool_info          = pool_info;
-    _border_size        = BorderSize(pool_pad_y, pool_pad_x);
-    _border_size.right  = std::max(upper_bound_w, pool_pad_x);
-    _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+    _input     = input;
+    _output    = output;
+    _pool_info = pool_info;
 
     // Select appropriate function
     switch(pool_size)
@@ -268,13 +365,13 @@
                 switch(pool_type)
                 {
                     case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::AVG, false>;
                         break;
                     case PoolingType::L2:
-                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f16<PoolingType::L2, false>;
                         break;
                     case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX>;
+                        _func = &NEPoolingLayerKernel::pooling2_f16<PoolingType::MAX, false>;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -285,13 +382,13 @@
                 switch(pool_type)
                 {
                     case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
                         break;
                     case PoolingType::L2:
-                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
                         break;
                     case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX>;
+                        _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -332,13 +429,13 @@
                 switch(pool_type)
                 {
                     case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::AVG, false>;
                         break;
                     case PoolingType::L2:
-                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f16<PoolingType::L2, false>;
                         break;
                     case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX>;
+                        _func = &NEPoolingLayerKernel::pooling3_f16<PoolingType::MAX, false>;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -349,13 +446,13 @@
                 switch(pool_type)
                 {
                     case PoolingType::AVG:
-                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::AVG, false>;
                         break;
                     case PoolingType::L2:
-                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2>;
+                        _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling3_f32<PoolingType::L2, false>;
                         break;
                     case PoolingType::MAX:
-                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX>;
+                        _func = &NEPoolingLayerKernel::pooling3_f32<PoolingType::MAX, false>;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -366,13 +463,13 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG>;
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
                     break;
                 case PoolingType::L2:
-                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2>;
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX>;
+                    _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -382,13 +479,13 @@
             switch(pool_type)
             {
                 case PoolingType::AVG:
-                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG>;
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
                     break;
                 case PoolingType::L2:
-                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2>;
+                    _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
                     break;
                 case PoolingType::MAX:
-                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX>;
+                    _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Unsupported pooling type!");
@@ -397,12 +494,9 @@
     }
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 template <PoolingType pooling_type>
@@ -533,10 +627,10 @@
     input, output);
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling3_f16(const Window &window_input, const Window &window)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
@@ -547,8 +641,8 @@
     int                 pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
     const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
@@ -572,7 +666,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float16x4_t scale_v = vdup_n_f16(scale);
             // Perform pooling
             const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
@@ -595,25 +689,25 @@
         *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
     },
     input, output);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(window_input);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling2_f16(const Window &window_input, const Window &window)
 {
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator      input(_input, window_input);
     Iterator      output(_output, window);
     constexpr int pool_size = 2;
     int           pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
     const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
@@ -635,7 +729,7 @@
 
         if(pooling_type != PoolingType::MAX)
         {
-            const float       scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float       scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float16x8_t scale_v = vdupq_n_f16(scale);
             res                       = vmulq_f16(scale_v, vaddq_f16(bottom_data.val[1], vaddq_f16(bottom_data.val[0], vaddq_f16(top_data.val[0], top_data.val[1]))));
         }
@@ -654,14 +748,14 @@
         vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
     },
     input, output);
-#else  /* ARM_COMPUTE_ENABLE_FP16 */
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(window_input);
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_ERROR("FP16 Not supported! Recompile the library with arch=arm64-v8.2-a");
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
@@ -674,8 +768,8 @@
     int           pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
     const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
@@ -697,7 +791,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -868,7 +962,7 @@
     input, output);
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
@@ -881,8 +975,8 @@
     int                 pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
     const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
@@ -907,7 +1001,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -935,7 +1029,7 @@
     input, output);
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::pooling7_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
@@ -948,8 +1042,8 @@
     int                 pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     std::array<const uint8_t *, pool_size> input_ptrs{ {} };
     for(int i = 0; i < pool_size; ++i)
@@ -964,7 +1058,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            float             scale   = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            float             scale   = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
             const float32x2_t scale_v = vdup_n_f32(scale);
 
             // Perform pooling
@@ -1017,21 +1111,21 @@
     input, output);
 }
 
-template <PoolingType pooling_type>
+template <PoolingType pooling_type, bool exclude_padding>
 void NEPoolingLayerKernel::poolingN_f32(const Window &window_input, const Window &window)
 {
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
-    const int pool_size     = _pool_info.pool_size();
+    const int pool_size     = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
     int       pool_pad_x    = 0;
     int       pool_pad_y    = 0;
     int       pool_stride_x = 0;
     int       pool_stride_y = 0;
     std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-    const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x;
-    const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y;
+    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1040,7 +1134,7 @@
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
-            const float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+            const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
 
             // Perform pooling
             float32x4_t vres = vdupq_n_f32(0.0f);
@@ -1138,6 +1232,34 @@
     input, output);
 }
 
+Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+    unsigned int pooled_w                          = 0;
+    unsigned int pooled_h                          = 0;
+    unsigned int num_elems_processed_per_iteration = 0;
+    BorderSize   border_size(0);
+
+    const bool         is_global_pooling = pool_info.is_global_pooling();
+    const unsigned int pool_size         = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
+
+    // Validate pool info before calling scaled_dimensions
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_pool_info(input, pool_info, pool_size));
+
+    // Check output dimensions
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+                                                     input->dimension(1),
+                                                     pool_size,
+                                                     pool_size,
+                                                     pool_info.pad_stride_info());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, pooled_w, pooled_h, pool_size));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info, num_elems_processed_per_iteration, border_size, pooled_w, pooled_h, pool_size).first);
+
+    return Status{};
+}
+
 void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
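
Pooling follows the same pattern as the other kernels in this patch: a static validate() recomputes the pooled dimensions with scaled_dimensions() and replays the window setup on cloned infos. A hedged usage sketch; the PoolingLayerInfo and PadStrideInfo constructor arguments shown (including exclude_padding) are assumptions about the public headers, not taken from this diff:

// Sketch: checking a 3x3 average pooling configuration up front.
#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

bool can_run_pooling()
{
    // 32x32 input, 3x3 pool, stride 2, pad 1 -> 16x16 output (floor rounding)
    const TensorInfo input(TensorShape(32U, 32U, 16U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U, 16U, 16U), 1, DataType::F32);

    // Last argument excludes padded elements from the average
    const PoolingLayerInfo pool_info(PoolingType::AVG, 3, PadStrideInfo(2, 2, 1, 1), true);

    return bool(NEPoolingLayerKernel::validate(&input, &output, pool_info));
}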
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 83004ae..9b8d931 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -68,6 +68,11 @@
 {
 }
 
+BorderSize NERemapKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
 void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -102,15 +107,19 @@
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
 
-    AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
+    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
 
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, 1),
+    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
+
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access,
                               AccessWindowRectangle(map_x->info(), 0, 0, num_elems_processed_per_iteration, 1),
                               AccessWindowRectangle(map_y->info(), 0, 0, num_elems_processed_per_iteration, 1),
                               output_access);
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
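The new input access window rounds the right edge up to the processing step and only adds the one-pixel border when that rounding leaves no slack. The arithmetic, sketched standalone (ceil_to_mult is an illustrative stand-in for the library's ceil_to_multiple):

    // Round the input width up to the vector step, then add the right border only
    // when the rounding itself provided no extra elements.
    int ceil_to_mult(int value, int multiple)
    {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int access_right(int input_width, int step, int border_right)
    {
        const int total_right = ceil_to_mult(input_width, step);
        return total_right + ((total_right == input_width) ? border_right : 0);
    }
    // e.g. width 64, step 16, border 1 -> 65; width 60, step 16, border 1 -> 64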
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 8e69252..a0f324e 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -60,7 +60,7 @@
 
 void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
                                                   DataType::U32, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -94,6 +94,7 @@
     {
         case DataType::U8:
         case DataType::S8:
+        case DataType::QASYMM8:
         case DataType::QS8:
             reshape_tensor<uint8_t>(window, _input, _output);
             break;
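QASYMM8 values are stored as plain uint8_t, so the kernel can route them through the same byte-wise path as U8/S8/QS8: a reshape never changes element values or their order, only the shape metadata. A toy sketch of that element-preserving copy (hypothetical reshape_flat, not the kernel's reshape_tensor):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // A reshape copies the element bytes unchanged; only the shape metadata the
    // caller attaches to the result differs. Any 8-bit type can share this path.
    std::vector<uint8_t> reshape_flat(const std::vector<uint8_t> &src)
    {
        std::vector<uint8_t> dst(src.size());
        std::memcpy(dst.data(), src.data(), src.size());
        return dst;
    }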
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 6634d4b..1918a77 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -48,12 +48,15 @@
     return BorderSize(1);
 }
 
-void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined)
+void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined,
+                              SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON(output == input);
+    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(sampling_policy);
 
     if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
@@ -180,8 +183,10 @@
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr);
-                const int offset_row = in_yi * input_stride;
+                const int in_yi         = std::floor((id.y() + 0.5f) * hr);
+                const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
+                ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
+                const int offset_row = in_yi_clamped * input_stride;
 
                 tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
                 tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
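With CENTER sampling the nearest-neighbour source row is floor((y + 0.5) * hr); the new code clamps it to [-1, input_height] so that, together with the one-row border, the subsequent loads stay inside the padded input. A scalar sketch of that index computation (standalone, not the kernel code):

    #include <algorithm>
    #include <cmath>

    // Nearest-neighbour source row for output row y with CENTER sampling.
    // hr is input_height / output_height; the result is clamped to the rows
    // covered by the tensor plus its one-row border: [-1, input_height].
    int source_row(int y, float hr, int input_height)
    {
        const int in_yi = static_cast<int>(std::floor((y + 0.5f) * hr));
        return std::min(input_height, std::max(in_yi, -1));
    }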
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index 4cc80f8..40a3e31 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -286,7 +286,7 @@
                 }
             };
 
-            const int32x4x2_t out = compute_hor_sobel_x(data_s32);
+            const int32x4x2_t out = compute_hor_sobel_y(data_s32);
             vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
             vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
         },
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 648dac4..b13fb0e 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -42,6 +42,149 @@
 
 namespace
 {
+Status validate_arguments_logits_1d_max(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        // Softmax across the x dimension
+        TensorShape output_shape{ input->tensor_shape() };
+        output_shape.set(0, 1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure kernel window
+    constexpr unsigned int num_elems_written_per_row = 1;
+    const int              input_width               = input->valid_region().shape.x();
+
+    unsigned int           num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+    Window                 win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = false;
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_row, 1.f / input_width);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments_logits_1d_shift_exp_sum(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, max, sum, output);
+    ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    }
+
+    // Checks performed when sum is configured
+    if(sum->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, sum);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(max, sum);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, max, sum);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_logits_1d_shift_exp_sum(ITensorInfo *input, ITensorInfo *max, ITensorInfo *output, ITensorInfo *sum)
+{
+    unsigned int num_elems_processed_per_iteration = input->valid_region().shape.x();
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal max_access(max, 0, 1);
+    AccessWindowHorizontal sum_access(sum, 0, 1);
+    bool                   window_changed = false;
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, max_access, sum_access);
+    }
+
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Status validate_arguments_logits_1d_norm(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_logits_1d_norm(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output)
+{
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
+    Window       win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     sum_access(sum, 0, 0, 1, sum->dimension(1));
+    bool                   window_changed = false;
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+        window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
+
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, input_access, sum_access);
+    }
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
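The helpers above size the processing step from the 16-byte NEON register (16 / element size) and return an "Insufficient Padding!" error when the access windows cannot be satisfied; later in this file the max kernel also sizes its right border so each row spans whole vector steps. Both computations, sketched standalone with illustrative names:

    #include <cstddef>

    // Elements per 16-byte NEON register: 16 for 8-bit types, 8 for F16/QS16,
    // 4 for F32.
    unsigned int elems_per_iteration(std::size_t element_size)
    {
        return 16u / static_cast<unsigned int>(element_size);
    }

    // Right border so a row of 'width' elements spans whole vector steps, written
    // exactly as the kernel computes it (note: this yields a full extra step when
    // width is already a multiple of the step).
    unsigned int right_border(unsigned int width, unsigned int step)
    {
        return step - width % step;
    }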
+
 void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window)
 {
     Window in_slice = window.first_slice_window_1D();
@@ -106,7 +249,7 @@
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 void logits_1d_max_f16(const ITensor *in, ITensor *out, const Window &window)
 {
     Window in_slice = window.first_slice_window_1D();
@@ -138,7 +281,7 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
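__ARM_FEATURE_FP16_VECTOR_ARITHMETIC is the ACLE feature macro that the compiler defines only when float16 vector arithmetic is actually available for the target, so guarding on it is more reliable than the build-level ARM_COMPUTE_ENABLE_FP16 define it replaces. The guard pattern, sketched in isolation:

    #include <arm_neon.h>

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // Compiled only when the target really supports FP16 vector arithmetic.
    float16x8_t add_f16(float16x8_t a, float16x8_t b)
    {
        return vaddq_f16(a, b);
    }
    #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */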
 
 void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window)
 {
@@ -184,8 +327,7 @@
 
 void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Softmax across the x dimension
     TensorShape output_shape{ input->info()->tensor_shape() };
@@ -194,9 +336,8 @@
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(input->info(), output->info()));
 
     const int    input_width                       = input->info()->valid_region().shape.x();
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
@@ -213,10 +354,10 @@
             _func = &logits_1d_max_f32;
             break;
         case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             _func = &logits_1d_max_f16;
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
     }
@@ -226,17 +367,17 @@
     _border_size = BorderSize(0, num_elems_processed_per_iteration - (input_width % num_elems_processed_per_iteration), 0, 0);
 
     // Configure kernel window
-    constexpr unsigned int num_elems_written_per_row = 1;
+    auto win_config = validate_and_configure_window_logits_1d_max(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_row, 1.f / input_width);
+Status NELogits1DMaxKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_max(input->clone().get(), output->clone().get()).first);
 
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NELogits1DMaxKernel::run(const Window &window, const ThreadInfo &info)
@@ -251,8 +392,10 @@
 
 namespace
 {
-void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
 {
+    ARM_COMPUTE_UNUSED(beta);
+
     Window window_max(window);
     window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
 
@@ -313,8 +456,10 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
-void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
 {
+    ARM_COMPUTE_UNUSED(beta);
+
     Window window_max(window);
     window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
 
@@ -374,8 +519,8 @@
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
 
-#ifdef ARM_COMPUTE_ENABLE_FP16
-void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+void logits_1d_shift_exp_sum_f16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
 {
     Window window_max(window);
     window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -410,6 +555,7 @@
         {
             float16x8_t vec_elements = vld1q_f16(in_ptr);
             vec_elements             = vsubq_f16(vec_elements, vec_max);
+            vec_elements             = vmulq_n_f16(vec_elements, beta);
             vec_elements             = vexpq_f16(vec_elements);
 
             vst1q_f16(exp_ptr, vec_elements);
@@ -426,7 +572,7 @@
         // Run remaining elements
         for(int i = 0; i < small_steps; ++i)
         {
-            const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr));
+            const float16_t element = std::exp(static_cast<float>(in_ptr[i] - *max_ptr) * beta);
             exp_ptr[i]              = element;
             sum += element;
         }
@@ -434,9 +580,9 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice));
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window)
+void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta)
 {
     Window window_max(window);
     window_max.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -471,6 +617,7 @@
         {
             float32x4_t vec_elements = vld1q_f32(in_ptr);
             vec_elements             = vsubq_f32(vec_elements, vec_max);
+            vec_elements             = vmulq_n_f32(vec_elements, beta);
             vec_elements             = vexpq_f32(vec_elements);
 
             vst1q_f32(exp_ptr, vec_elements);
@@ -488,7 +635,7 @@
         // Run remaining elements
         for(int i = 0; i < small_steps; ++i)
         {
-            float element = std::exp(in_ptr[i] - *max_ptr);
+            float element = std::exp((in_ptr[i] - *max_ptr) * beta);
             exp_ptr[i]    = element;
             sum += element;
         }
@@ -500,25 +647,20 @@
 } //namespace
 
 NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel()
-    : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
+    : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr), _beta(1.0f)
 {
 }
 
-void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum)
+void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, max, sum, output);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output, max, sum);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-
-    unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x();
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info(), beta));
 
     switch(input->info()->data_type())
     {
@@ -532,10 +674,10 @@
             _func = &logits_1d_shift_exp_sum_f32;
             break;
         case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             _func = &logits_1d_shift_exp_sum_f16;
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
             break;
@@ -545,20 +687,20 @@
     _max    = max;
     _output = output;
     _sum    = sum;
+    _beta   = beta;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal max_access(max->info(), 0, 1);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+    auto win_config = validate_and_configure_window_logits_1d_shift_exp_sum(input->info(), max->info(), output->info(), sum->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+Status NELogits1DShiftExpSumKernel::validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_shift_exp_sum(input, max, output, sum, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_shift_exp_sum(input->clone().get(), max->clone().get(), output->clone().get(), sum->clone().get()).first);
 
-    output_access.set_valid_region(win, input->info()->valid_region());
-    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NELogits1DShiftExpSumKernel::run(const Window &window, const ThreadInfo &info)
@@ -568,7 +710,7 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _max, _output, _sum, window);
+    (*_func)(_input, _max, _output, _sum, window, _beta);
 }
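With the new beta parameter the shifted logits are scaled before exponentiation, i.e. softmax(x)_i = exp(beta * (x_i - max)) / sum_j exp(beta * (x_j - max)). A scalar reference sketch of the shift/exp/sum stage followed by the normalisation stage (illustrative only, not the NEON kernels above; assumes x is non-empty):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scalar reference for the three stages: 1) row maximum, 2) exp(beta * (x - max))
    // and its running sum, 3) normalisation by the sum.
    std::vector<float> softmax_1d(const std::vector<float> &x, float beta)
    {
        const float max_val = *std::max_element(x.begin(), x.end());

        std::vector<float> out(x.size());
        float              sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = std::exp((x[i] - max_val) * beta);
            sum += out[i];
        }
        for(float &v : out)
        {
            v /= sum;
        }
        return out;
    }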
 
 namespace
@@ -637,7 +779,7 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 void logits_1d_norm_f16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
 {
     Window window_sum(window);
@@ -668,7 +810,7 @@
     }
     while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window)
 {
@@ -709,23 +851,18 @@
 
 void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_norm(input->info(), sum->info(), output->info()));
 
     _input  = input;
     _sum    = sum;
     _output = output;
 
-    // Configure kernel window
-    unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
-
     switch(input->info()->data_type())
     {
         case DataType::QS8:
@@ -738,26 +875,27 @@
             _func = &logits_1d_norm_f32;
             break;
         case DataType::F16:
-#ifdef ARM_COMPUTE_ENABLE_FP16
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             _func = &logits_1d_norm_f16;
             break;
-#endif /* ARM_COMPUTE_ENABLE_FP16 */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Unsupported data type.");
             break;
     }
 
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window_logits_1d_norm(input->info(), sum->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+Status NELogits1DNormKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_norm(input, sum, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_logits_1d_norm(input->clone().get(), sum->clone().get(), output->clone().get()).first);
 
-    update_window_and_padding(win, input_access, sum_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NELogits1DNormKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 1cfaafe..c863ed4 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -23,10 +23,11 @@
  */
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
@@ -40,138 +41,455 @@
 
 namespace
 {
+TensorShape transposed_tensor_shape(const TensorShape &in)
+{
+    TensorShape  output_shape{ in };
+    const size_t w_out = in[1];
+    const size_t h_out = in[0];
+    output_shape.set(0, w_out);
+    output_shape.set(1, h_out);
+
+    return output_shape;
+}
+
+unsigned int num_elems_processed(size_t element_size)
+{
+    switch(element_size)
+    {
+        case 1:
+            return 8;
+            break;
+        case 2:
+            return 4;
+            break;
+        case 4:
+            return 4;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+                                                         DataType::F16,
+                                                         DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Note: This kernel performs 16 elements per iteration.
+    // However, since we use a left-over for loop on both dimensions (X and Y), no read or write may go out of bounds.
+    // For this reason num_elems_processed_per_iteration_x is set to 1
+    const unsigned int num_elems_processed_per_iteration_x = 1;
+    const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(input->element_size());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1));
+
+    bool window_changed = update_window_and_padding(win, input_access);
+
+    if(output->total_size() != 0)
+    {
+        AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
 void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
 {
+    const int    window_step_x            = 8;
+    const int    window_step_y            = 8;
+    const int    window_start_x           = window.x().start();
+    const int    window_end_x             = window.x().end();
+    const int    window_start_y           = window.y().start();
+    const int    window_end_y             = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+    const int    window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
+    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];
+
+    // Check if we need a left-over loop for the y dimension
+    bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+    if(left_over_loop_y)
+    {
+        // Check if window_end_y_multiple_of is greater than window_start_y
+        if(window_end_y_multiple_of > window_start_y)
+        {
+            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+        }
+        else
+        {
+            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+        }
+    }
+
     Window window_out(window);
     window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    Iterator input(in, window);
     Iterator output(out, window_out);
 
-    const size_t input_stride_in_bytes  = in->info()->strides_in_bytes()[1];
-    const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    // Run the NEON path if and only if the input is not a row-vector
+    if(in->info()->dimension(1) != 1)
     {
-        const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 0 * input_stride_in_bytes));
-        const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 1 * input_stride_in_bytes));
-        const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 2 * input_stride_in_bytes));
-        const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 3 * input_stride_in_bytes));
-        const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 4 * input_stride_in_bytes));
-        const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 5 * input_stride_in_bytes));
-        const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 6 * input_stride_in_bytes));
-        const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + 7 * input_stride_in_bytes));
+        Iterator input(in, window_in);
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            // Compute 8x8 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
+                const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
+                const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
+                const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
+                const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
+                const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
+                const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
+                const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
 
-        // Transpose 2x2
-        const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
-        const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
-        const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
-        const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+                // Transpose 2x2
+                const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+                const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+                const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+                const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
 
-        // Transpose 4x4
-        const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
-        const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
-        const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
-        const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+                // Transpose 4x4
+                const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+                const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+                const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+                const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
 
-        // Transpose 8x8
-        const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
-        const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
-        const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
-        const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+                // Transpose 8x8
+                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+                const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+                const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
 
-        // Compute destination address
-        const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
 
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
-        vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
-    },
-    input, output);
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+            }
+
+            // Compute left-over elements along the x dimension (1x8)
+            for(; x < window_end_x; ++x)
+            {
+                const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
+                const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
+                const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
+                const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
+                const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
+                const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
+                const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
+                const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
+
+                uint8x8_t result = vdup_n_u8(0);
+                result           = vset_lane_u8(val0, result, 0);
+                result           = vset_lane_u8(val1, result, 1);
+                result           = vset_lane_u8(val2, result, 2);
+                result           = vset_lane_u8(val3, result, 3);
+                result           = vset_lane_u8(val4, result, 4);
+                result           = vset_lane_u8(val5, result, 5);
+                result           = vset_lane_u8(val6, result, 6);
+                result           = vset_lane_u8(val7, result, 7);
+
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+
+                vst1_u8(output.ptr() + dst_offset_in_bytes, result);
+            }
+        },
+        input, output);
+    }
+
+    if(left_over_loop_y)
+    {
+        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+        Iterator input(in, window_in);
+        Iterator output(out, window_out);
+
+        // Compute left-over elements along the y dimension (1x1)
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            const uint8_t val0 = *input.ptr();
+
+            // Compute destination address
+            const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+
+            *(output.ptr() + dst_offset_in_bytes) = val0;
+        },
+        input, output);
+    }
 }
 
 void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
 {
+    const int    window_step_x            = 4;
+    const int    window_step_y            = 4;
+    const int    window_start_x           = window.x().start();
+    const int    window_end_x             = window.x().end();
+    const int    window_start_y           = window.y().start();
+    const int    window_end_y             = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+    const int    window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
+    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];
+
+    // Check if we need a left-over loop for the y dimension
+    bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+    if(left_over_loop_y)
+    {
+        // Check if window_end_y_multiple_of is greater than window_start_y
+        if(window_end_y_multiple_of > window_start_y)
+        {
+            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+        }
+        else
+        {
+            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+        }
+    }
+
     Window window_out(window);
     window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    Iterator input(in, window);
     Iterator output(out, window_out);
 
-    const size_t input_stride_in_bytes  = in->info()->strides_in_bytes()[1];
-    const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    // Run the NEON path if and only if the input is not a row-vector
+    if(in->info()->dimension(1) != 1)
     {
-        const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes));
-        const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes));
-        const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes));
-        const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes));
+        Iterator input(in, window_in);
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            // Compute 4x4 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-        // Transpose 2x2
-        const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
-        const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+                // Transpose 2x2
+                const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+                const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
 
-        // Transpose 4x4
-        const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
-        const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+                // Transpose 4x4
+                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
 
-        // Compute destination address
-        const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
 
-        vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
-        vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
-        vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
-        vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
-    },
-    input, output);
+                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
+                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
+                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
+                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
+            }
+
+            // Compute left-over elements (1x4)
+            for(; x < window_end_x; ++x)
+            {
+                const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+                uint16x4_t result = vdup_n_u16(0);
+                result            = vset_lane_u16(val0, result, 0);
+                result            = vset_lane_u16(val1, result, 1);
+                result            = vset_lane_u16(val2, result, 2);
+                result            = vset_lane_u16(val3, result, 3);
+
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+
+                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
+            }
+        },
+        input, output);
+    }
+
+    if(left_over_loop_y)
+    {
+        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+        Iterator input(in, window_in);
+        Iterator output(out, window_out);
+
+        // Compute left-over elements along the y dimension (1x1)
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
+
+            // Compute destination address
+            const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+
+            *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+        },
+        input, output);
+    }
 }
 
 void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
 {
+    const int    window_step_x            = 4;
+    const int    window_step_y            = 4;
+    const int    window_start_x           = window.x().start();
+    const int    window_end_x             = window.x().end();
+    const int    window_start_y           = window.y().start();
+    const int    window_end_y             = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+    const int    window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
+    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];
+
+    // Check if we need a left-over loop for the y dimension
+    bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
+
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+    if(left_over_loop_y)
+    {
+        // Check if window_end_y_multiple_of is greater than window_start_y
+        if(window_end_y_multiple_of > window_start_y)
+        {
+            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
+        }
+        else
+        {
+            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
+        }
+    }
+
     Window window_out(window);
     window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    Iterator input(in, window);
     Iterator output(out, window_out);
 
-    const size_t input_stride_in_bytes  = in->info()->strides_in_bytes()[1];
-    const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    // Run the NEON path if and only if the input is not a row-vector
+    if(in->info()->dimension(1) != 1)
     {
-        const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes));
-        const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes));
-        const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes));
-        const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes));
+        Iterator input(in, window_in);
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            // Compute 4x4 elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-        // Transpose 2x2
-        const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
-        const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
-        const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
-        const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+                // Transpose 2x2
+                const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+                const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+                const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+                const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
 
-        // Compute destination address
-        const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
 
-        // Swap block 01 with block 10 and store
-        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
-        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
-        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
-        vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
-    },
-    input, output);
+                // Swap block 01 with block 10 and store
+                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+            }
+
+            // Compute left-over elements (1x4)
+            for(; x < window_end_x; ++x)
+            {
+                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+
+                uint32x4_t result = vdupq_n_u32(0);
+                result            = vsetq_lane_u32(val0, result, 0);
+                result            = vsetq_lane_u32(val1, result, 1);
+                result            = vsetq_lane_u32(val2, result, 2);
+                result            = vsetq_lane_u32(val3, result, 3);
+
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+
+                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
+            }
+        },
+        input, output);
+    }
+
+    if(left_over_loop_y)
+    {
+        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
+        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
+
+        Iterator input(in, window_in);
+        Iterator output(out, window_out);
+
+        // Compute left-over elements along the y dimension (1x1)
+        execute_window_loop(window_in, [&](const Coordinates & id)
+        {
+            const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+
+            // Compute destination address
+            const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+
+            *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+        },
+        input, output);
+    }
 }
 } // namespace
 
+Status NETransposeKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
 NETransposeKernel::NETransposeKernel()
     : _func(nullptr), _input(nullptr), _output(nullptr)
 {
@@ -179,41 +497,26 @@
 
 void NETransposeKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t w_out = input->info()->dimension(1);
-    const size_t h_out = input->info()->dimension(0);
-    output_shape.set(0, w_out);
-    output_shape.set(1, h_out);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    unsigned int num_elems_processed_per_iteration = 0;
-
     switch(input->info()->element_size())
     {
         case 1:
-            _func                             = &transpose_8bit_elements;
-            num_elems_processed_per_iteration = 8;
+            _func = &transpose_8bit_elements;
             break;
         case 2:
-            _func                             = &transpose_16bit_elements;
-            num_elems_processed_per_iteration = 4;
+            _func = &transpose_16bit_elements;
             break;
         case 4:
-            _func                             = &transpose_32bit_elements;
-            num_elems_processed_per_iteration = 4;
+            _func = &transpose_32bit_elements;
             break;
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
@@ -221,16 +524,9 @@
     }
 
     // Configure kernel window
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
-    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 void NETransposeKernel::run(const Window &window, const ThreadInfo &info)
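
The vectorized 32-bit path above (like the 8-/16-bit paths before it) addresses the output as dst_offset_in_bytes = y * sizeof(element) + x * output_stride_in_bytes, i.e. a plain element-wise transpose with the roles of row and column swapped. A scalar reference sketch of that addressing, useful for checking the NEON paths against (function and parameter names are illustrative, not part of the library):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar reference: element (x, y) of the input lands at row x, column y of the
// output, i.e. at byte offset x * out_stride_bytes + y * sizeof(uint32_t).
void transpose_u32_reference(const std::uint8_t *in, std::uint8_t *out,
                             std::size_t width, std::size_t height,
                             std::size_t in_stride_bytes, std::size_t out_stride_bytes)
{
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            std::uint32_t val;
            std::memcpy(&val, in + y * in_stride_bytes + x * sizeof(std::uint32_t), sizeof(val));
            std::memcpy(out + x * out_stride_bytes + y * sizeof(std::uint32_t), &val, sizeof(val));
        }
    }
}
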
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
new file mode 100644
index 0000000..fe63336
--- /dev/null
+++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "support/ToolchainSupport.h"
+
+#include "src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp"
+
+using T = winograd_shim_nchw::Winograd2x2_3x3GEMM<float, float>;
+
+namespace arm_compute
+{
+class Winograd3x3F32::Private
+{
+public:
+    Private(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage)
+        : convolver(kernel_shape, input_shape, padding_type, kernel_storage)
+    {
+    }
+
+    T convolver;
+};
+
+Winograd3x3F32::~Winograd3x3F32()
+{
+}
+
+void Winograd3x3F32::nchw2nhwc(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, const void *const input)
+{
+    _pimpl->convolver.nchw2nhwc(input_shape, padding_type, working_space, reinterpret_cast<const float *>(input));
+}
+
+void Winograd3x3F32::nhwc2nchw(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, void *const output)
+{
+    _pimpl->convolver.nhwc2nchw(input_shape, padding_type, working_space, reinterpret_cast<float *const>(output));
+}
+
+void Winograd3x3F32::transform_weights(const void *const kernel, void *transform_working_space)
+{
+    _pimpl->convolver.transform_weights(reinterpret_cast<const float *>(kernel), transform_working_space);
+}
+
+void Winograd3x3F32::reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const void *const input, void *working_space)
+{
+    _pimpl->convolver.reshape_input(input_shape, padding_type, reinterpret_cast<const float *>(input), working_space);
+}
+
+void Winograd3x3F32::reshape_output(const Tensor4DShape &input_shape, const PaddingType padding_type, void *const output)
+{
+#if defined(__aarch64__)
+    _pimpl->convolver.reshape_output(input_shape, padding_type, reinterpret_cast<float *const>(output));
+#else  /* __aarch64__ */
+    ARM_COMPUTE_UNUSED(input_shape);
+    ARM_COMPUTE_UNUSED(padding_type);
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_ERROR("Not implemented");
+#endif /* __aarch64__ */
+}
+
+std::pair<void *, void *> Winograd3x3F32::get_nhwc_ptrs(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space)
+{
+    return _pimpl->convolver.get_nhwc_ptrs(input_shape, padding_type, working_space);
+}
+
+Winograd3x3F32::Winograd3x3F32(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage)
+    : _pimpl(support::cpp14::make_unique<Private>(kernel_shape, input_shape, padding_type, kernel_storage))
+{
+}
+
+size_t NEWinogradLayerKernel::get_kernel_storage_size(const KernelShape &shape)
+{
+    return T::get_kernel_storage_size(shape);
+}
+
+size_t NEWinogradLayerKernel::get_working_space_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, const PaddingType padding)
+{
+    return T::get_working_space_size(input_shape, k_shape, padding);
+}
+
+size_t NEWinogradLayerKernel::get_kernel_transform_working_size(const KernelShape &shape)
+{
+    return T::get_kernel_transform_working_size(shape);
+}
+
+NEWinogradLayerKernel::NEWinogradLayerKernel()
+    : _convolver(nullptr), _output(nullptr)
+{
+}
+
+void NEWinogradLayerKernel::configure(ITensor *output, Winograd3x3F32 *convolver)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    _convolver = convolver;
+    Window win = calculate_max_window(*output->info());
+    INEKernel::configure(win);
+}
+
+void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(info.num_threads < 1);
+    const size_t tid                  = info.thread_id;
+    const size_t num_threads          = std::min(info.num_threads, 16);
+    const size_t num_gemms_per_thread = 16 / num_threads;
+    const size_t first_gemm           = tid * num_gemms_per_thread;
+    const size_t last_gemm            = (tid == (num_threads - 1)) ? 15 : first_gemm + num_gemms_per_thread - 1;
+    _convolver->_pimpl->convolver.execute(first_gemm, last_gemm);
+}
+} // namespace arm_compute
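
NEWinogradLayerKernel::run above splits the 16 Winograd GEMM batches into contiguous ranges, one per thread, with the last thread absorbing the remainder when 16 is not divisible by the thread count. A standalone sketch of that index arithmetic (the helper name and the num_batches parameter are illustrative; the kernel hard-codes 16):

#include <algorithm>
#include <cstddef>
#include <utility>

// Returns the inclusive [first, last] GEMM batch range for one thread.
// Assumes requested_threads >= 1, as the kernel itself asserts.
std::pair<std::size_t, std::size_t> gemm_range_for_thread(std::size_t tid,
                                                          std::size_t requested_threads,
                                                          std::size_t num_batches = 16)
{
    const std::size_t num_threads          = std::min(requested_threads, num_batches);
    const std::size_t num_gemms_per_thread = num_batches / num_threads;
    const std::size_t first_gemm           = tid * num_gemms_per_thread;
    // The last thread also takes any remainder batches so all of them are covered.
    const std::size_t last_gemm            = (tid == num_threads - 1) ? num_batches - 1
                                                                      : first_gemm + num_gemms_per_thread - 1;
    return { first_gemm, last_gemm };
}
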
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
new file mode 100644
index 0000000..e020cd9
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+// Enable only if compiled for AArch64-V8A targets
+#ifdef ARM_COMPUTE_AARCH64_V8A
+
+namespace arm_compute
+{
+NEGEMMLowpAArch64A53Kernel::NEGEMMLowpAArch64A53Kernel()
+    : _func(nullptr)
+{
+}
+
+void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+                               const ThreadInfo &info)
+{
+    const int lda = input0->info()->strides_in_bytes().y();
+    const int ldb = input1->info()->strides_in_bytes().y();
+    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
+
+    const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
+
+    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = output->info()->tensor_shape().x();
+    const int K = input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(input0, window);
+    Iterator out(output, window);
+
+    GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *_workspace     = workspace->buffer() + offset;
+    size_t           workspace_size = workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
+                     reinterpret_cast<const int8_t *>(in1_ptr), ldb,
+                     reinterpret_cast<int32_t *>(out.ptr()), ldc,
+                     alpha, beta, _workspace);
+    },
+    in0, out);
+}
+
+void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+                               const ThreadInfo &info)
+{
+    const int lda = input0->info()->strides_in_bytes().y();
+    const int ldb = input1->info()->strides_in_bytes().y();
+    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
+
+    const auto in1_ptr = reinterpret_cast<const uint8_t *>(input1->buffer());
+
+    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = output->info()->tensor_shape().x();
+    const int K = input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(input0, window);
+    Iterator out(output, window);
+
+    GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *_workspace     = workspace->buffer() + offset;
+    size_t           workspace_size = workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
+                     reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
+                     reinterpret_cast<uint32_t *>(out.ptr()), ldc,
+                     alpha, beta, _workspace);
+    },
+    in0, out);
+}
+
+void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    _input0      = input0;
+    _input1      = input1;
+    _output      = output;
+    _workspace   = workspace;
+    _alpha       = alpha;
+    _beta        = beta;
+    _transform_0 = transform_0;
+    _transform_1 = transform_1;
+
+    switch(input0->info()->data_type())
+    {
+        case DataType::S8:
+            _func = &gemm_interleaved_s16_12x8;
+            break;
+        case DataType::U8:
+            _func = &gemm_interleaved_u16_12x8;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info());
+
+    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 12);
+    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+
+    update_window_and_padding(win,
+                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+                              output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEGEMMLowpAArch64A53Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+}
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8A */
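
Both gemm_interleaved_* helpers above carve a private, 4096-byte-aligned slice out of the shared workspace tensor for each thread before running the interleaved GEMM; the same pattern recurs in the kernels below. A minimal sketch of that carving, assuming std::align in place of the support::cpp11::align wrapper used by the kernels (the helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <memory>

// Returns a 4096-byte-aligned pointer inside 'base' for the given thread, or
// nullptr if the workspace is too small to hold this thread's working set.
void *per_thread_workspace(std::uint8_t *base, std::size_t total_size,
                           std::size_t working_size, std::size_t thread_id)
{
    constexpr std::size_t alignment = 4096;
    // One padded slot per thread: the extra (alignment - 1) bytes leave room to
    // round the pointer up to the next 4096-byte boundary.
    const std::size_t offset = (working_size + alignment - 1) * thread_id;
    if(offset >= total_size)
    {
        return nullptr;
    }
    void       *ptr       = base + offset;
    std::size_t remaining = total_size - offset;
    return std::align(alignment, working_size, ptr, remaining);
}
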
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
new file mode 100644
index 0000000..db37201
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+// Enable only if compiled for AArch64-V8A targets
+#ifdef ARM_COMPUTE_AARCH64_V8A
+
+namespace arm_compute
+{
+NEGEMMLowpAArch64Kernel::NEGEMMLowpAArch64Kernel()
+    : _func(nullptr)
+{
+}
+
+void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+                         const ThreadInfo &info)
+{
+    const int lda = input0->info()->strides_in_bytes().y();
+    const int ldb = input1->info()->strides_in_bytes().y();
+    const int ldc = output->info()->strides_in_bytes().y() / sizeof(int32_t);
+
+    const auto in1_ptr = reinterpret_cast<const int8_t *>(input1->buffer());
+
+    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = output->info()->tensor_shape().x();
+    const int K = input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(input0, window);
+    Iterator out(output, window);
+
+    GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *_workspace     = workspace->buffer() + offset;
+    size_t           workspace_size = workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const int8_t *>(in0.ptr()), lda,
+                     reinterpret_cast<const int8_t *>(in1_ptr), ldb,
+                     reinterpret_cast<int32_t *>(out.ptr()), ldc,
+                     alpha, beta, _workspace);
+    },
+    in0, out);
+}
+
+void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+                         const ThreadInfo &info)
+{
+    const int lda = input0->info()->strides_in_bytes().y();
+    const int ldb = input1->info()->strides_in_bytes().y();
+    const int ldc = output->info()->strides_in_bytes().y() / sizeof(uint32_t);
+
+    const auto in1_ptr = reinterpret_cast<const uint8_t *>(input1->buffer());
+
+    const int M = std::min(output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = output->info()->tensor_shape().x();
+    const int K = input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(input0, window);
+    Iterator out(output, window);
+
+    GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *_workspace     = workspace->buffer() + offset;
+    size_t           workspace_size = workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), _workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const uint8_t *>(in0.ptr()), lda,
+                     reinterpret_cast<const uint8_t *>(in1_ptr), ldb,
+                     reinterpret_cast<uint32_t *>(out.ptr()), ldc,
+                     alpha, beta, _workspace);
+    },
+    in0, out);
+}
+
+void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    _input0      = input0;
+    _input1      = input1;
+    _output      = output;
+    _workspace   = workspace;
+    _alpha       = alpha;
+    _beta        = beta;
+    _transform_0 = transform_0;
+    _transform_1 = transform_1;
+
+    switch(input0->info()->data_type())
+    {
+        case DataType::S8:
+            _func = &gemm_interleaved_s8;
+            break;
+        case DataType::U8:
+            _func = &gemm_interleaved_u8;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info());
+
+    AccessWindowRectangle output_access(output->info(), 0, 0, 4, 4);
+
+    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 4);
+    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 4);
+
+    update_window_and_padding(win,
+                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+                              output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEGEMMLowpAArch64Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+}
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
new file mode 100644
index 0000000..e996e57
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+// Enable only if compiled for AArch64-V8.2-A targets
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+
+namespace
+{
+using namespace arm_compute;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*output);
+
+    AccessWindowRectangle output_access(output, 0, 0, 12, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
+    const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
+                                                    AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
+                                                    output_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+namespace arm_compute
+{
+void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
+
+    _input0      = input0;
+    _input1      = input1;
+    _output      = output;
+    _workspace   = workspace;
+    _alpha       = alpha;
+    _beta        = beta;
+    _transform_0 = transform_0;
+    _transform_1 = transform_1;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+
+    return Status{};
+}
+
+void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const int lda = _input0->info()->strides_in_bytes().y();
+    const int ldb = _input1->info()->strides_in_bytes().y();
+    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t);
+
+    const auto in1_ptr = reinterpret_cast<const gemm_u8_12x8::operand_type *>(_input1->buffer());
+
+    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = _output->info()->tensor_shape().x();
+    const int K = _input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(_input0, window);
+    Iterator out(_output, window);
+
+    GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_1, !_transform_1);
+
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *workspace      = _workspace->buffer() + offset;
+    size_t           workspace_size = _workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const gemm_u8_12x8::operand_type *>(in0.ptr()), lda,
+                     reinterpret_cast<const gemm_u8_12x8::operand_type *>(in1_ptr), ldb,
+                     reinterpret_cast<gemm_u8_12x8::result_type *>(out.ptr()), ldc,
+                     _alpha, _beta, workspace);
+    },
+    in0, out);
+}
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
new file mode 100644
index 0000000..2256304
--- /dev/null
+++ b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
+} // namespace arm_compute
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+namespace arm_compute
+{
+void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+
+    _input0      = input0;
+    _input1      = input1;
+    _output      = output;
+    _workspace   = workspace;
+    _alpha       = alpha;
+    _beta        = beta;
+    _transform_0 = transform_0;
+    _transform_1 = transform_1;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info());
+
+    AccessWindowRectangle output_access(output->info(), 0, 0, 24, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
+    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 24);
+
+    update_window_and_padding(win,
+                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
+                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
+                              output_access);
+
+    INEKernel::configure(win);
+}
+
+void NEHGEMMAArch64FP16Kernel::run(const Window &window, const ThreadInfo &info)
+{
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const int lda = _input0->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
+    const int ldb = _input1->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::operand_type);
+    const int ldc = _output->info()->strides_in_bytes().y() / sizeof(hgemm_24x8::result_type);
+
+    const auto in1_ptr = reinterpret_cast<const hgemm_24x8::operand_type *>(_input1->buffer());
+
+    const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
+    const int N = _output->info()->tensor_shape().x();
+    const int K = _input0->info()->tensor_shape().x();
+
+    // Only iterate over batches
+    Window win(window);
+    win.set(0, Window::Dimension(0, 1, 1));
+    win.set(1, Window::Dimension(0, 1, 1));
+
+    Iterator in0(_input0, window);
+    Iterator out(_output, window);
+
+    GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+    constexpr size_t alignment      = 4096;
+    const size_t     offset         = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+    void            *workspace      = _workspace->buffer() + offset;
+    size_t           workspace_size = _workspace->info()->total_size();
+
+    if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+    {
+        ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        gemm.execute(reinterpret_cast<const hgemm_24x8::operand_type *>(in0.ptr()), lda,
+                     reinterpret_cast<const hgemm_24x8::operand_type *>(in1_ptr), ldb,
+                     reinterpret_cast<hgemm_24x8::result_type *>(out.ptr()), ldc,
+                     _alpha, 1.f, workspace);
+    },
+    in0, out);
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+}
+} // namespace arm_compute
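
The run() method above only compiles the FP16 GEMM when the compiler advertises __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, and otherwise keeps a stub that raises an error at run time so the kernel still links. A trimmed-down sketch of that gating pattern (the function name is illustrative):

#include <cstdio>

void fp16_path_or_error()
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    std::printf("FP16 vector arithmetic path compiled in\n");
#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    std::printf("Recompile with arch=arm64-v8.2-a to enable the FP16 path\n");
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
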
diff --git a/src/core/NEON/kernels/winograd/gemm.hpp b/src/core/NEON/kernels/winograd/gemm.hpp
new file mode 100644
index 0000000..111e196
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/gemm.hpp
@@ -0,0 +1,127 @@
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include "utils.hpp"
+
+template <typename TIn, typename TOut>
+void Gemm(const TIn* const a, const TIn* const b, TOut *c,
+          const int M, const int K, const int N,
+          const int a_row_stride,
+          const int b_row_stride,
+          const int c_row_stride,
+          const bool a_transposed=false,
+          const bool b_transposed=false) {
+  // Array access methods
+  const auto A = [a, a_transposed, M, K, a_row_stride] (const int i, const int j) -> TIn {
+    return a[(!a_transposed) ? i*a_row_stride + j : i + j*M];
+  };
+
+  const auto B = [b, b_transposed, K, N, b_row_stride] (const int i, const int j) -> TIn {
+    return b[(!b_transposed) ? i*b_row_stride + j : i + j*N];
+  };
+
+  const auto C = [c, c_row_stride] (const int i, const int j) -> TOut& {
+    return c[i*c_row_stride + j];
+  };
+
+  // Perform the matrix multiplication
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++) {
+      for (int k = 0; k < K; k++) {
+        C(i, j) += A(i, k) * B(k, j);
+      }
+    }
+  }
+}
+
+template <const int M_BLOCK, const int N_BLOCK, typename TIn, typename TOut>
+void BlockedGemm(
+  const TIn* const a, const TIn* const b, TOut *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  // Array access methods
+  const auto A = [a, a_row_stride] (const int i, const int j) -> TIn {
+    return a[i*a_row_stride + j];
+  };
+
+  const auto B = [b, b_row_stride] (const int i, const int j) -> TIn {
+    return b[i*b_row_stride + j];
+  };
+
+  const auto C = [c, c_row_stride] (const int i, const int j) -> TOut& {
+    return c[i*c_row_stride + j];
+  };
+
+  const int M_BLOCKS = iceildiv(M, M_BLOCK);
+  const int N_BLOCKS = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < M_BLOCKS; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < N_BLOCKS; nblock++) {
+      // Create an appropriately sized block of accumulators
+      TOut accum[M_BLOCK][N_BLOCK];
+      for (int i = 0; i < M_BLOCK; i++) {
+        for (int j = 0; j < N_BLOCK; j++) {
+          accum[i][j] = static_cast<TOut>(0);
+        }
+      }
+
+      // Perform this portion of the matrix multiply
+      for (int k = 0; k < K; k++) {
+        // Load elements of A
+        TIn elems_a[M_BLOCK];
+        for (int i = 0; i < M_BLOCK; i++) {
+          elems_a[i] = A(mblock*M_BLOCK + i, k);
+        }
+
+        // Load elements of B
+        TIn elems_b[N_BLOCK];
+        for (int j = 0; j < N_BLOCK; j++) {
+          elems_b[j] = B(k, nblock*N_BLOCK + j);
+        }
+
+        // Perform the partial matrix multiply
+        for (int i = 0; i < M_BLOCK; i++) {
+          for (int j = 0; j < N_BLOCK; j++) {
+            accum[i][j] += elems_a[i] * elems_b[j];
+          }
+        }
+      }
+
+      // Store the partial product
+      for (int i = 0; i < M_BLOCK; i++) {
+        for (int j = 0; j < N_BLOCK; j++) {
+          C(mblock*M_BLOCK + i, nblock*N_BLOCK + j) = accum[i][j];
+        }
+      }
+    }
+  }
+}
+
+#include "gemm/a64_sgemm.hpp"
diff --git a/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp b/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp
new file mode 100644
index 0000000..e1b7488
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include <cassert>
+#include "../utils.hpp"
+
+#ifdef __aarch64__
+
+template <>
+inline void BlockedGemm<8, 12, float, float>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  const int M_BLOCK = 8;
+  const int N_BLOCK = 12;
+
+  const int m_blocks = iceildiv(M, M_BLOCK);
+  const int n_blocks = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < m_blocks; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < n_blocks; nblock++) {
+      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
+      const float *bptr = b + nblock*N_BLOCK;
+      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
+      int k = K;
+
+      asm volatile (
+          // Create an 8x12 block of accumulators
+          " A_1 .req v27\n"
+          "sA_1 .req s27\n"
+          " A_2 .req v28\n"
+          "sA_2 .req s28\n"
+          " A_3 .req v29\n"
+          "sA_3 .req s29\n"
+          " A_4 .req v30\n"
+          "sA_4 .req s30\n"
+
+          " B_1 .req v24\n" " B_2 .req v25\n" " B_3 .req v26\n"
+          "qB_1 .req q24\n" "qB_2 .req q25\n" "qB_3 .req q26\n"
+
+          " C_11 .req  v0\n" " C_12 .req  v1\n" " C_13 .req  v2\n"
+          " C_21 .req  v3\n" " C_22 .req  v4\n" " C_23 .req  v5\n"
+          " C_31 .req  v6\n" " C_32 .req  v7\n" " C_33 .req  v8\n"
+          " C_41 .req  v9\n" " C_42 .req v10\n" " C_43 .req v11\n"
+          " C_51 .req v12\n" " C_52 .req v13\n" " C_53 .req v14\n"
+          " C_61 .req v15\n" " C_62 .req v16\n" " C_63 .req v17\n"
+          " C_71 .req v18\n" " C_72 .req v19\n" " C_73 .req v20\n"
+          " C_81 .req v21\n" " C_82 .req v22\n" " C_83 .req v23\n"
+
+          "qC_11 .req  q0\n" "qC_12 .req  q1\n" "qC_13 .req  q2\n"
+          "qC_21 .req  q3\n" "qC_22 .req  q4\n" "qC_23 .req  q5\n"
+          "qC_31 .req  q6\n" "qC_32 .req  q7\n" "qC_33 .req  q8\n"
+          "qC_41 .req  q9\n" "qC_42 .req q10\n" "qC_43 .req q11\n"
+          "qC_51 .req q12\n" "qC_52 .req q13\n" "qC_53 .req q14\n"
+          "qC_61 .req q15\n" "qC_62 .req q16\n" "qC_63 .req q17\n"
+          "qC_71 .req q18\n" "qC_72 .req q19\n" "qC_73 .req q20\n"
+          "qC_81 .req q21\n" "qC_82 .req q22\n" "qC_83 .req q23\n"
+
+          "aptr1 .req x17\n"
+          "aptr2 .req x18\n"
+          "aptr3 .req x19\n"
+          "aptr4 .req x20\n"
+          "aptr5 .req x21\n"
+          "aptr6 .req x22\n"
+          "aptr7 .req x23\n"
+
+          // Initialise accumulators with 0
+          // Initialise pointers
+          "movi C_11.4s, #0\n"
+          "add aptr1, %x[aptr], %x[a_row_stride]\n"
+          "movi C_12.4s, #0\n"
+          "add aptr2,    aptr1, %x[a_row_stride]\n"
+          "movi C_13.4s, #0\n"
+          "add aptr3,    aptr2, %x[a_row_stride]\n"
+          "movi C_21.4s, #0\n"
+          "add aptr4,    aptr3, %x[a_row_stride]\n"
+          "movi C_22.4s, #0\n"
+          "add aptr5,    aptr4, %x[a_row_stride]\n"
+          "movi C_23.4s, #0\n"
+          "add aptr6,    aptr5, %x[a_row_stride]\n"
+          "movi C_31.4s, #0\n"
+          "add aptr7,    aptr6, %x[a_row_stride]\n"
+          "movi C_32.4s, #0\n"
+          "ldr qB_1, [%x[bptr]]\n"
+          "movi C_33.4s, #0\n"
+          "ldr qB_2, [%x[bptr], #0x10]\n"
+          "movi C_41.4s, #0\n"
+          "prfm pldl1keep, [%x[bptr], #0x00]\n"
+          "movi C_42.4s, #0\n"
+          "prfm pldl1keep, [%x[bptr], #0x10]\n"
+          "movi C_43.4s, #0\n"
+          "prfm pldl1keep, [%x[bptr], #0x20]\n"
+          "movi C_51.4s, #0\n"
+          "prfm pldl1keep, [%x[aptr], #0x00]\n"
+          "movi C_52.4s, #0\n"
+          "prfm pldl1keep, [   aptr1, #0x00]\n"
+          "movi C_53.4s, #0\n"
+          "prfm pldl1keep, [   aptr2, #0x00]\n"
+          "movi C_61.4s, #0\n"
+          "prfm pldl1keep, [   aptr3, #0x00]\n"
+          "movi C_62.4s, #0\n"
+          "prfm pldl1keep, [   aptr4, #0x00]\n"
+          "movi C_63.4s, #0\n"
+          "prfm pldl1keep, [   aptr5, #0x00]\n"
+          "movi C_71.4s, #0\n"
+          "prfm pldl1keep, [   aptr6, #0x00]\n"
+          "movi C_72.4s, #0\n"
+          "prfm pldl1keep, [   aptr7, #0x00]\n"
+          "movi C_73.4s, #0\n"
+          "ldr sA_1, [%x[aptr]], #0x4\n"
+          "movi C_81.4s, #0\n"
+          "ldr sA_2, [   aptr1], #0x4\n"
+          "movi C_82.4s, #0\n"
+          "ldr sA_3, [   aptr2], #0x4\n"
+          "movi C_83.4s, #0\n"
+          "subs %x[k], %x[k], #1\n"
+          "beq 2f\n"
+
+          "1:"
+            "fmla C_11.4s, B_1.4s, A_1.s[0]\n"
+            "ldr qB_3, [%x[bptr], #0x20]\n"
+            "fmla C_12.4s, B_2.4s, A_1.s[0]\n"
+            "ldr sA_4, [   aptr3], #0x4\n"
+            "fmla C_13.4s, B_3.4s, A_1.s[0]\n"
+            "ldr sA_1, [   aptr4], #0x04\n"
+
+            "fmla C_21.4s, B_1.4s, A_2.s[0]\n"
+            "add %x[bptr], %x[bptr], %x[b_row_stride]\n"
+            "fmla C_22.4s, B_2.4s, A_2.s[0]\n"
+            "prfm pldl1keep, [   aptr3, #0x10]\n"
+            "fmla C_23.4s, B_3.4s, A_2.s[0]\n"
+            "ldr sA_2, [   aptr5], #0x04\n"
+
+            "fmla C_31.4s, B_1.4s, A_3.s[0]\n"
+            "prfm pldl1keep, [%x[bptr], #0x00]\n"
+            "fmla C_32.4s, B_2.4s, A_3.s[0]\n"
+            "prfm pldl1keep, [%x[bptr], #0x10]\n"
+            "fmla C_33.4s, B_3.4s, A_3.s[0]\n"
+            "ldr sA_3, [   aptr6], #0x04\n"
+
+            "fmla C_41.4s, B_1.4s, A_4.s[0]\n"
+            "prfm pldl1keep, [%x[bptr], #0x20]\n"
+            "fmla C_42.4s, B_2.4s, A_4.s[0]\n"
+            "prfm pldl1keep, [   aptr4, #0x10]\n"
+            "fmla C_43.4s, B_3.4s, A_4.s[0]\n"
+            "ldr sA_4, [   aptr7], #0x04\n"
+
+            "fmla C_51.4s, B_1.4s, A_1.s[0]\n"
+            "prfm pldl1keep, [   aptr5, #0x10]\n"
+            "fmla C_52.4s, B_2.4s, A_1.s[0]\n"
+            "prfm pldl1keep, [   aptr6, #0x10]\n"
+            "fmla C_53.4s, B_3.4s, A_1.s[0]\n"
+            "ldr sA_1, [%x[aptr]], #0x04\n"
+
+            "fmla C_61.4s, B_1.4s, A_2.s[0]\n"
+            "prfm pldl1keep, [   aptr7, #0x10]\n"
+            "fmla C_62.4s, B_2.4s, A_2.s[0]\n"
+            "subs %x[k], %x[k], #1\n"
+            "fmla C_63.4s, B_3.4s, A_2.s[0]\n"
+            "ldr sA_2, [   aptr1], #0x04\n"
+
+            "fmla C_71.4s, B_1.4s, A_3.s[0]\n"
+            "prfm pldl1keep, [%x[aptr], #0x10]\n"
+            "fmla C_72.4s, B_2.4s, A_3.s[0]\n"
+            "prfm pldl1keep, [   aptr1, #0x10]\n"
+            "fmla C_73.4s, B_3.4s, A_3.s[0]\n"
+            "ldr sA_3, [   aptr2], #0x04\n"
+
+            "fmla C_81.4s, B_1.4s, A_4.s[0]\n"
+            "prfm pldl1keep, [   aptr2, #0x10]\n"
+            "fmla C_82.4s, B_2.4s, A_4.s[0]\n"
+            "ldp qB_1, qB_2, [%x[bptr]]\n"
+            "fmla C_83.4s, B_3.4s, A_4.s[0]\n"
+            "bne 1b\n"
+
+          "2:"
+            "fmla C_11.4s, B_1.4s, A_1.s[0]\n"
+            "ldr qB_3, [%x[bptr], #0x20]\n"
+            "fmla C_12.4s, B_2.4s, A_1.s[0]\n"
+            "stp qC_11, qC_12, [%x[cptr]]\n"
+            "fmla C_13.4s, B_3.4s, A_1.s[0]\n"
+            "str qC_13, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+            "ldr sA_1, [   aptr4], #0x04\n"
+
+            "fmla C_21.4s, B_1.4s, A_2.s[0]\n"
+            "ldr sA_4, [   aptr3], #0x4\n"
+            "fmla C_22.4s, B_2.4s, A_2.s[0]\n"
+            "stp qC_21, qC_22, [%x[cptr]]\n"
+            "fmla C_23.4s, B_3.4s, A_2.s[0]\n"
+            "str qC_23, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+            "ldr sA_2, [   aptr5], #0x04\n"
+
+            "fmla C_31.4s, B_1.4s, A_3.s[0]\n"
+            "fmla C_32.4s, B_2.4s, A_3.s[0]\n"
+            "stp qC_31, qC_32, [%x[cptr]]\n"
+            "fmla C_33.4s, B_3.4s, A_3.s[0]\n"
+            "str qC_33, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+            "ldr sA_3, [   aptr6], #0x04\n"
+
+            "fmla C_41.4s, B_1.4s, A_4.s[0]\n"
+            "fmla C_42.4s, B_2.4s, A_4.s[0]\n"
+            "stp qC_41, qC_42, [%x[cptr]]\n"
+            "fmla C_43.4s, B_3.4s, A_4.s[0]\n"
+            "str qC_43, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+            "ldr sA_4, [   aptr7], #0x04\n"
+
+            "fmla C_51.4s, B_1.4s, A_1.s[0]\n"
+            "fmla C_52.4s, B_2.4s, A_1.s[0]\n"
+            "stp qC_51, qC_52, [%x[cptr]]\n"
+            "fmla C_53.4s, B_3.4s, A_1.s[0]\n"
+            "str qC_53, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+
+            "fmla C_61.4s, B_1.4s, A_2.s[0]\n"
+            "fmla C_62.4s, B_2.4s, A_2.s[0]\n"
+            "stp qC_61, qC_62, [%x[cptr]]\n"
+            "fmla C_63.4s, B_3.4s, A_2.s[0]\n"
+            "str qC_63, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+
+            "fmla C_71.4s, B_1.4s, A_3.s[0]\n"
+            "fmla C_72.4s, B_2.4s, A_3.s[0]\n"
+            "stp qC_71, qC_72, [%x[cptr]]\n"
+            "fmla C_73.4s, B_3.4s, A_3.s[0]\n"
+            "str qC_73, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+
+            "fmla C_81.4s, B_1.4s, A_4.s[0]\n"
+            "fmla C_82.4s, B_2.4s, A_4.s[0]\n"
+            "stp qC_81, qC_82, [%x[cptr]]\n"
+            "fmla C_83.4s, B_3.4s, A_4.s[0]\n"
+            "str qC_83, [%x[cptr], #0x20]\n"
+            "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
+
+          // Clear aliases
+          ".unreq aptr1\n"
+          ".unreq aptr2\n"
+          ".unreq aptr3\n"
+          ".unreq aptr4\n"
+          ".unreq aptr5\n"
+          ".unreq aptr6\n"
+          ".unreq aptr7\n"
+
+          ".unreq  A_1\n" ".unreq  A_2\n" ".unreq  A_3\n" ".unreq  A_4\n"
+          ".unreq sA_1\n" ".unreq sA_2\n" ".unreq sA_3\n" ".unreq sA_4\n"
+
+          ".unreq  B_1\n" ".unreq  B_2\n" ".unreq  B_3\n"
+          ".unreq qB_1\n" ".unreq qB_2\n" ".unreq qB_3\n"
+
+          ".unreq C_11\n" ".unreq C_12\n" ".unreq C_13\n"
+          ".unreq C_21\n" ".unreq C_22\n" ".unreq C_23\n"
+          ".unreq C_31\n" ".unreq C_32\n" ".unreq C_33\n"
+          ".unreq C_41\n" ".unreq C_42\n" ".unreq C_43\n"
+          ".unreq C_51\n" ".unreq C_52\n" ".unreq C_53\n"
+          ".unreq C_61\n" ".unreq C_62\n" ".unreq C_63\n"
+          ".unreq C_71\n" ".unreq C_72\n" ".unreq C_73\n"
+          ".unreq C_81\n" ".unreq C_82\n" ".unreq C_83\n"
+
+          ".unreq qC_11\n" ".unreq qC_12\n" ".unreq qC_13\n"
+          ".unreq qC_21\n" ".unreq qC_22\n" ".unreq qC_23\n"
+          ".unreq qC_31\n" ".unreq qC_32\n" ".unreq qC_33\n"
+          ".unreq qC_41\n" ".unreq qC_42\n" ".unreq qC_43\n"
+          ".unreq qC_51\n" ".unreq qC_52\n" ".unreq qC_53\n"
+          ".unreq qC_61\n" ".unreq qC_62\n" ".unreq qC_63\n"
+          ".unreq qC_71\n" ".unreq qC_72\n" ".unreq qC_73\n"
+          ".unreq qC_81\n" ".unreq qC_82\n" ".unreq qC_83\n"
+          : [aptr] "+r" (aptr),
+            [bptr] "+r" (bptr),
+            [cptr] "+r" (cptr),
+            [k] "+r" (k)
+          : [a_row_stride] "r" (a_row_stride * sizeof(float)),
+            [b_row_stride] "r" (b_row_stride * sizeof(float)),
+            [c_row_stride] "r" (c_row_stride * sizeof(float))
+          : "cc", "memory",
+            "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+            "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
+            "v29", "v30", "x17", "x18", "x19", "x20", "x21", "x22", "x23"
+      );
+    }
+  }
+}
+
+/*****************************************************************************/
+/* 4x16 blocked GEMM with specialised tails
+ */
+#include "a64_sgemm_4x16.hpp"
+
+template <>
+inline void BlockedGemm<4, 16, float, float>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  // Despatch based on the tail of K (K % 4)
+  switch (K % 4) {
+    case 3:
+      sgemm_4x16_impl<3>(
+        a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
+      );
+      break;
+    case 2:
+      sgemm_4x16_impl<2>(
+        a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
+      );
+      break;
+    case 1:
+      sgemm_4x16_impl<1>(
+        a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
+      );
+      break;
+    case 0:
+      sgemm_4x16_impl<0>(
+        a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
+      );
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+#endif  // __aarch64__
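
For context, a minimal scalar sketch of what the blocked kernels in this file compute: C = A * B with row-major A (M x K), B (K x N) and C (M x N) addressed through the given row strides (in elements), with C overwritten rather than accumulated into. The blocked loops use iceildiv with no edge masking, so M and N appear to be assumed multiples of the block sizes. The name reference_gemm and the code below are illustrative only, not part of the patch.

    // Illustrative reference only: same indexing conventions as the blocked
    // kernels above (strides are in elements, C is overwritten).
    template <typename T>
    void reference_gemm(const T* const a, const T* const b, T* c,
                        const int M, const int K, const int N,
                        const int a_row_stride, const int b_row_stride,
                        const int c_row_stride) {
      for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
          T acc = static_cast<T>(0);
          for (int k = 0; k < K; k++) {
            acc += a[m*a_row_stride + k] * b[k*b_row_stride + n];
          }
          c[m*c_row_stride + n] = acc;
        }
      }
    }
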
diff --git a/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp b/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp
new file mode 100644
index 0000000..e74610e
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp
@@ -0,0 +1,1445 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+template <const unsigned int tail>
+inline void sgemm_4x16_impl(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+);
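
The tail template parameter is the remainder K % 4 chosen by the switch in BlockedGemm<4, 16>; each specialisation below then runs k = (K - tail) / 4 unrolled blocks of four K-steps followed by a tail of that many steps. A small sketch of the split (the helper name split_k is illustrative, not from the library):

    #include <cassert>

    // Illustrative helper: how K is divided between the unrolled loop counter
    // 'k' and the tail handled by sgemm_4x16_impl<tail>.
    inline void split_k(const int K, int &k_blocks, int &tail) {
      tail     = K % 4;           // selects the specialisation, e.g. <3> for K = 11
      k_blocks = (K - tail) / 4;  // initial value of the 'k' register in the asm
      assert(k_blocks * 4 + tail == K);
    }
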
+
+template <>
+inline void sgemm_4x16_impl<0>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  const int TAIL_SIZE = 0;
+  const int M_BLOCK = 4;
+  const int N_BLOCK = 16;
+
+  const int m_blocks = iceildiv(M, M_BLOCK);
+  const int n_blocks = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < m_blocks; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < n_blocks; nblock++) {
+      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
+      const float *bptr = b + nblock*N_BLOCK;
+      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
+      int k = (K - TAIL_SIZE) / 4;
+
+      asm volatile(
+        "aptr2 .req X20\n"
+        "aptr3 .req X21\n"
+        "aptr4 .req X22\n"
+        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
+        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
+        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
+        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
+        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
+        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
+        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
+        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
+        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
+        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
+        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
+        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
+        "vB1 .req v20\n" "qB1 .req q20\n"
+        "vB2 .req v21\n" "qB2 .req q21\n"
+        "vB3 .req v22\n" "qB3 .req q22\n"
+        "vB4 .req v23\n" "qB4 .req q23\n"
+
+        // Clear accumulators, initialise pointers
+        "movi vC11.4s, #0\n"
+        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
+        "movi vC12.4s, #0\n"
+        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
+        "movi vC13.4s, #0\n"
+        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
+        "movi vC14.4s, #0\n"
+        "ldr qA1, [%x[aptr]], #0x10\n"
+        "movi vC21.4s, #0\n"
+        "ldr qA2, [   aptr2], #0x10\n"
+        "movi vC22.4s, #0\n"
+        "ldr qB1, [%x[bptr], #0x00]\n"
+        "movi vC23.4s, #0\n"
+        "ldr qB2, [%x[bptr], #0x10]\n"
+        "movi vC24.4s, #0\n"
+        "ldr qB3, [%x[bptr], #0x20]\n"
+        "movi vC31.4s, #0\n"
+        "movi vC32.4s, #0\n"
+        "movi vC33.4s, #0\n"
+        "movi vC34.4s, #0\n"
+        "movi vC41.4s, #0\n"
+        "movi vC42.4s, #0\n"
+        "movi vC43.4s, #0\n"
+        "movi vC44.4s, #0\n"
+        "subs %x[k], %x[k], #1\n"
+        "beq 2f\n"
+
+        "1:"  // Loop proper
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "subs %x[k], %x[k], #1\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+          "bne 1b\n"
+
+        "2:"  // Tail
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "stp qC11, qC12, [%x[cptr], #0x00]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "stp qC13, qC14, [%x[cptr], #0x20]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "stp qC21, qC22, [%x[cptr], #0x00]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "stp qC23, qC24, [%x[cptr], #0x20]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "stp qC31, qC32, [%x[cptr], #0x00]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "stp qC33, qC34, [%x[cptr], #0x20]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "stp qC41, qC42, [%x[cptr], #0x00]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+          "stp qC43, qC44, [%x[cptr], #0x20]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+
+        ".unreq vB4\n" ".unreq qB4\n"
+        ".unreq vB3\n" ".unreq qB3\n"
+        ".unreq vB2\n" ".unreq qB2\n"
+        ".unreq vB1\n" ".unreq qB1\n"
+        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
+        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
+        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
+        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
+        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
+        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
+        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
+        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
+        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
+        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
+        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
+        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
+        ".unreq aptr2\n"
+        ".unreq aptr3\n"
+        ".unreq aptr4\n"
+
+        : [aptr] "+r" (aptr),
+          [bptr] "+r" (bptr),
+          [cptr] "+r" (cptr),
+          [k] "+r" (k)
+        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
+          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
+          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
+        : "cc", "memory", "x20", "x21", "x22",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23"
+      );
+    }
+  }
+}
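
Each pass through the loop at label "1:" above performs four rank-1 updates of the 4x16 accumulator tile, one per lane of the A registers, advancing bptr by b_row_stride after every lane so that four rows of B are consumed per pass. A scalar sketch of one such pass (names and array shapes are illustrative only):

    // Illustrative only: one unrolled iteration of the "1:" loop, i.e. four
    // rank-1 updates of the 4x16 tile kept in vC11..vC44.
    static void sgemm_4x16_block(float C[4][16], const float A[4][4],
                                 const float B[4][16]) {
      for (int lane = 0; lane < 4; lane++) {      // vA?.s[0] .. vA?.s[3]
        for (int m = 0; m < 4; m++) {             // accumulator rows vC1? .. vC4?
          for (int n = 0; n < 16; n++) {          // four 4-float columns vC?1 .. vC?4
            C[m][n] += A[m][lane] * B[lane][n];   // fmla vCmn.4s, vBn.4s, vAm.s[lane]
          }
        }
      }
    }
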
+
+template <>
+inline void sgemm_4x16_impl<1>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  const int TAIL_SIZE = 1;
+  const int M_BLOCK = 4;
+  const int N_BLOCK = 16;
+
+  const int m_blocks = iceildiv(M, M_BLOCK);
+  const int n_blocks = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < m_blocks; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < n_blocks; nblock++) {
+      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
+      const float *bptr = b + nblock*N_BLOCK;
+      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
+      int k = (K - TAIL_SIZE) / 4;
+
+      asm volatile(
+        "aptr2 .req X20\n"
+        "aptr3 .req X21\n"
+        "aptr4 .req X22\n"
+        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
+        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
+        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
+        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
+        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
+        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
+        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
+        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
+        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
+        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
+        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
+        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
+        "vB1 .req v20\n" "qB1 .req q20\n"
+        "vB2 .req v21\n" "qB2 .req q21\n"
+        "vB3 .req v22\n" "qB3 .req q22\n"
+        "vB4 .req v23\n" "qB4 .req q23\n"
+
+        // Clear accumulators, initialise pointers
+        "movi vC11.4s, #0\n"
+        "ldr qB1, [%x[bptr], #0x00]\n"
+        "movi vC12.4s, #0\n"
+        "ldr qB2, [%x[bptr], #0x10]\n"
+        "movi vC13.4s, #0\n"
+        "ldr qB3, [%x[bptr], #0x20]\n"
+        "movi vC14.4s, #0\n"
+        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
+        "movi vC21.4s, #0\n"
+        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
+        "movi vC22.4s, #0\n"
+        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
+        "movi vC23.4s, #0\n"
+        "cbnz %x[k], 3f\n"
+
+        // Prepare for tail in K
+        "movi vC24.4s, #0\n"
+        "ldr sA1, [%x[aptr]], #0x04\n"
+        "movi vC31.4s, #0\n"
+        "ldr sA2, [   aptr2], #0x04\n"
+        "movi vC32.4s, #0\n"
+        "movi vC33.4s, #0\n"
+        "movi vC34.4s, #0\n"
+        "movi vC41.4s, #0\n"
+        "movi vC42.4s, #0\n"
+        "movi vC43.4s, #0\n"
+        "movi vC44.4s, #0\n"
+        "b 2f\n"  // Jump to tail
+
+        "3:"  // Prepare for loop over K
+          "movi vC24.4s, #0\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "movi vC31.4s, #0\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "movi vC32.4s, #0\n"
+          "movi vC33.4s, #0\n"
+          "movi vC34.4s, #0\n"
+          "movi vC41.4s, #0\n"
+          "movi vC42.4s, #0\n"
+          "movi vC43.4s, #0\n"
+          "movi vC44.4s, #0\n"
+          "subs %x[k], %x[k], #1\n"
+          "beq 4f\n"
+
+        "1:"  // Loop proper
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "subs %x[k], %x[k], #1\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+          "bne 1b\n"
+
+        "4:"  // Tail iteration
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr sA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr sA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+
+        "2:"  // Common tail
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "stp qC11, qC12, [%x[cptr], #0x00]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "ldr sA3, [   aptr3], #0x10\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "stp qC13, qC14, [%x[cptr], #0x20]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "stp qC21, qC22, [%x[cptr], #0x00]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "ldr sA4, [   aptr4], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "stp qC23, qC24, [%x[cptr], #0x20]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "stp qC31, qC32, [%x[cptr], #0x00]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "stp qC33, qC34, [%x[cptr], #0x20]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "stp qC41, qC42, [%x[cptr], #0x00]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+          "stp qC43, qC44, [%x[cptr], #0x20]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+
+        ".unreq vB4\n" ".unreq qB4\n"
+        ".unreq vB3\n" ".unreq qB3\n"
+        ".unreq vB2\n" ".unreq qB2\n"
+        ".unreq vB1\n" ".unreq qB1\n"
+        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
+        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
+        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
+        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
+        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
+        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
+        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
+        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
+        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
+        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
+        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
+        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
+        ".unreq aptr2\n"
+        ".unreq aptr3\n"
+        ".unreq aptr4\n"
+
+        : [aptr] "+r" (aptr),
+          [bptr] "+r" (bptr),
+          [cptr] "+r" (cptr),
+          [k] "+r" (k)
+        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
+          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
+          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
+        : "cc", "memory", "x20", "x21", "x22",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23"
+      );
+    }
+  }
+}
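
The <1>, <2> and <3> specialisations share this structure and differ only in how many trailing K-steps the common tail at label "2:" applies before the C tile is stored out: one here, two in <2>, and correspondingly three in <3>, with the final A loads narrowed accordingly (ldr sA* here, ldr dA* in <2>). When k == 0 (K == 1 for this specialisation), "cbnz %x[k], 3f" falls through, the remaining accumulators are cleared and the kernel branches straight to that tail. A compact scalar sketch of the shared tail (layouts here are compacted for illustration; the real kernels read A and B through their row strides):

    // Illustrative only: the last 'tail' K-steps (1, 2 or 3) applied to the
    // 4x16 accumulator tile. a_tail is 4 x tail, b_tail is tail x 16, both
    // row-major and compacted for the sketch.
    static void sgemm_4x16_tail(float C[4][16], const float *a_tail,
                                const float *b_tail, const int tail) {
      for (int t = 0; t < tail; t++) {
        for (int m = 0; m < 4; m++) {
          for (int n = 0; n < 16; n++) {
            C[m][n] += a_tail[m*tail + t] * b_tail[t*16 + n];
          }
        }
      }
    }
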
+
+template <>
+inline void sgemm_4x16_impl<2>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  const int TAIL_SIZE = 2;
+  const int M_BLOCK = 4;
+  const int N_BLOCK = 16;
+
+  const int m_blocks = iceildiv(M, M_BLOCK);
+  const int n_blocks = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < m_blocks; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < n_blocks; nblock++) {
+      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
+      const float *bptr = b + nblock*N_BLOCK;
+      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
+      int k = (K - TAIL_SIZE) / 4;
+
+      asm volatile(
+        "aptr2 .req X20\n"
+        "aptr3 .req X21\n"
+        "aptr4 .req X22\n"
+        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
+        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
+        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
+        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
+        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
+        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
+        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
+        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
+        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
+        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
+        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
+        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
+        "vB1 .req v20\n" "qB1 .req q20\n"
+        "vB2 .req v21\n" "qB2 .req q21\n"
+        "vB3 .req v22\n" "qB3 .req q22\n"
+        "vB4 .req v23\n" "qB4 .req q23\n"
+
+        // Clear accumulators, initialise pointers
+        "movi vC11.4s, #0\n"
+        "ldr qB1, [%x[bptr], #0x00]\n"
+        "movi vC12.4s, #0\n"
+        "ldr qB2, [%x[bptr], #0x10]\n"
+        "movi vC13.4s, #0\n"
+        "ldr qB3, [%x[bptr], #0x20]\n"
+        "movi vC14.4s, #0\n"
+        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
+        "movi vC21.4s, #0\n"
+        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
+        "movi vC22.4s, #0\n"
+        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
+        "movi vC23.4s, #0\n"
+        "cbnz %x[k], 3f\n"
+
+        // Prepare for tail in K
+        "movi vC24.4s, #0\n"
+        "ldr dA1, [%x[aptr]], #0x08\n"
+        "movi vC31.4s, #0\n"
+        "ldr dA2, [   aptr2], #0x08\n"
+        "movi vC32.4s, #0\n"
+        "movi vC33.4s, #0\n"
+        "movi vC34.4s, #0\n"
+        "movi vC41.4s, #0\n"
+        "movi vC42.4s, #0\n"
+        "movi vC43.4s, #0\n"
+        "movi vC44.4s, #0\n"
+        "b 2f\n"  // Jump to tail
+
+        "3:"  // Prepare for loop over K
+          "movi vC24.4s, #0\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "movi vC31.4s, #0\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "movi vC32.4s, #0\n"
+          "movi vC33.4s, #0\n"
+          "movi vC34.4s, #0\n"
+          "movi vC41.4s, #0\n"
+          "movi vC42.4s, #0\n"
+          "movi vC43.4s, #0\n"
+          "movi vC44.4s, #0\n"
+          "subs %x[k], %x[k], #1\n"
+          "beq 4f\n"
+
+        "1:"  // Loop proper
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "subs %x[k], %x[k], #1\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+          "bne 1b\n"
+
+        "4:"  // Tail iteration
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr dA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr dA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+
+        "2:"  // Common tail
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr dA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr dA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "stp qC11, qC12, [%x[cptr], #0x00]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "stp qC13, qC14, [%x[cptr], #0x20]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "stp qC21, qC22, [%x[cptr], #0x00]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "stp qC23, qC24, [%x[cptr], #0x20]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "stp qC31, qC32, [%x[cptr], #0x00]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "stp qC33, qC34, [%x[cptr], #0x20]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "stp qC41, qC42, [%x[cptr], #0x00]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+          "stp qC43, qC44, [%x[cptr], #0x20]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+
+        ".unreq vB4\n" ".unreq qB4\n"
+        ".unreq vB3\n" ".unreq qB3\n"
+        ".unreq vB2\n" ".unreq qB2\n"
+        ".unreq vB1\n" ".unreq qB1\n"
+        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
+        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
+        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
+        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
+        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
+        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
+        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
+        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
+        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
+        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
+        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
+        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
+        ".unreq aptr2\n"
+        ".unreq aptr3\n"
+        ".unreq aptr4\n"
+
+        : [aptr] "+r" (aptr),
+          [bptr] "+r" (bptr),
+          [cptr] "+r" (cptr),
+          [k] "+r" (k)
+        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
+          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
+          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
+        : "cc", "memory", "x20", "x21", "x22",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23"
+      );
+    }
+  }
+}
+
+template <>
+inline void sgemm_4x16_impl<3>(
+  const float* const a, const float* const b, float *c,
+  const int M, const int K, const int N,
+  const int a_row_stride,
+  const int b_row_stride,
+  const int c_row_stride
+) {
+  const int TAIL_SIZE = 3;
+  const int M_BLOCK = 4;
+  const int N_BLOCK = 16;
+
+  const int m_blocks = iceildiv(M, M_BLOCK);
+  const int n_blocks = iceildiv(N, N_BLOCK);
+
+  // For each block of output rows
+  for (int mblock = 0; mblock < m_blocks; mblock++) {
+    // For each block of output columns
+    for (int nblock = 0; nblock < n_blocks; nblock++) {
+      const float *aptr = a + mblock*M_BLOCK*a_row_stride;
+      const float *bptr = b + nblock*N_BLOCK;
+      float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
+      int k = (K - TAIL_SIZE) / 4;
+
+      asm volatile(
+        "aptr2 .req X20\n"
+        "aptr3 .req X21\n"
+        "aptr4 .req X22\n"
+        "vC11 .req  v0\n" "vC12 .req  v1\n" "vC13 .req  v2\n" "vC14 .req  v3\n"
+        "qC11 .req  q0\n" "qC12 .req  q1\n" "qC13 .req  q2\n" "qC14 .req  q3\n"
+        "vC21 .req  v4\n" "vC22 .req  v5\n" "vC23 .req  v6\n" "vC24 .req  v7\n"
+        "qC21 .req  q4\n" "qC22 .req  q5\n" "qC23 .req  q6\n" "qC24 .req  q7\n"
+        "vC31 .req  v8\n" "vC32 .req  v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
+        "qC31 .req  q8\n" "qC32 .req  q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
+        "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
+        "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
+        "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
+        "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
+        "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
+        "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
+        "vB1 .req v20\n" "qB1 .req q20\n"
+        "vB2 .req v21\n" "qB2 .req q21\n"
+        "vB3 .req v22\n" "qB3 .req q22\n"
+        "vB4 .req v23\n" "qB4 .req q23\n"
+
+        // Clear accumulators, initialise pointers
+        "movi vC11.4s, #0\n"
+        "ldr qB1, [%x[bptr], #0x00]\n"
+        "movi vC12.4s, #0\n"
+        "ldr qB2, [%x[bptr], #0x10]\n"
+        "movi vC13.4s, #0\n"
+        "ldr qB3, [%x[bptr], #0x20]\n"
+        "movi vC14.4s, #0\n"
+        "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
+        "movi vC21.4s, #0\n"
+        "add aptr3,    aptr2, %x[a_row_stride_bytes]\n"
+        "movi vC22.4s, #0\n"
+        "add aptr4,    aptr3, %x[a_row_stride_bytes]\n"
+        "movi vC23.4s, #0\n"
+        "cbnz %x[k], 3f\n"
+
+        // Prepare for tail in K
+        "movi vC24.4s, #0\n"
+        "ldr dA1, [%x[aptr]], #0x08\n"
+        "movi vC31.4s, #0\n"
+        "ldr dA2, [   aptr2], #0x08\n"
+        "movi vC32.4s, #0\n"
+        "movi vC33.4s, #0\n"
+        "movi vC34.4s, #0\n"
+        "movi vC41.4s, #0\n"
+        "movi vC42.4s, #0\n"
+        "movi vC43.4s, #0\n"
+        "movi vC44.4s, #0\n"
+        "b 2f\n"  // Jump to tail
+
+        "3:"  // Prepare for loop over K
+          "movi vC24.4s, #0\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "movi vC31.4s, #0\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "movi vC32.4s, #0\n"
+          "movi vC33.4s, #0\n"
+          "movi vC34.4s, #0\n"
+          "movi vC41.4s, #0\n"
+          "movi vC42.4s, #0\n"
+          "movi vC43.4s, #0\n"
+          "movi vC44.4s, #0\n"
+          "subs %x[k], %x[k], #1\n"
+          "beq 4f\n"
+
+        "1:"  // Loop proper
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "subs %x[k], %x[k], #1\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr qA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr qA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+          "bne 1b\n"
+
+        "4:"  // Tail iteration
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr qA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
+          "ldr dA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
+          "ldr dA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
+
+        "2:"  // Common tail
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr dA3, [   aptr3], #0x10\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "ldr dA4, [   aptr4], #0x10\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
+          "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
+          "ldr qB1, [%x[bptr], #0x00]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
+          "ldr qB2, [%x[bptr], #0x10]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
+          "ldr sA1, [%x[aptr]], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
+          "ldr sA2, [   aptr2], #0x10\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
+          "ldr qB3, [%x[bptr], #0x20]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
+
+          "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
+          "ldr qB4, [%x[bptr], #0x30]\n"
+          "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
+          "stp qC11, qC12, [%x[cptr], #0x00]\n"
+          "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
+          "ldr sA3, [   aptr3], #0x10\n"
+          "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
+          "stp qC13, qC14, [%x[cptr], #0x20]\n"
+          "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
+          "stp qC21, qC22, [%x[cptr], #0x00]\n"
+          "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
+          "ldr sA4, [   aptr4], #0x10\n"
+          "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
+          "stp qC23, qC24, [%x[cptr], #0x20]\n"
+          "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
+          "stp qC31, qC32, [%x[cptr], #0x00]\n"
+          "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
+          "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
+          "stp qC33, qC34, [%x[cptr], #0x20]\n"
+          "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+          "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
+          "stp qC41, qC42, [%x[cptr], #0x00]\n"
+          "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
+          "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
+          "stp qC43, qC44, [%x[cptr], #0x20]\n"
+          "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
+
+        ".unreq vB4\n" ".unreq qB4\n"
+        ".unreq vB3\n" ".unreq qB3\n"
+        ".unreq vB2\n" ".unreq qB2\n"
+        ".unreq vB1\n" ".unreq qB1\n"
+        ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
+        ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
+        ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
+        ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
+        ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
+        ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
+        ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
+        ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
+        ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
+        ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
+        ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
+        ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
+        ".unreq aptr2\n"
+        ".unreq aptr3\n"
+        ".unreq aptr4\n"
+
+        : [aptr] "+r" (aptr),
+          [bptr] "+r" (bptr),
+          [cptr] "+r" (cptr),
+          [k] "+r" (k)
+        : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
+          [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
+          [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
+        : "cc", "memory", "x20", "x21", "x22",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+          "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23"
+      );
+    }
+  }
+}
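
The inline assembly above is the inner loop of a 4x16 single-precision micro-kernel: each "fmla" multiplies four contiguous values of B by one broadcast element of A and accumulates into one of the sixteen vC accumulator registers, processing four values of k per main-loop iteration plus a tail. As a readability aid, here is a minimal scalar sketch of the arithmetic one call performs; the function name, the assumption that the accumulators start from zero, and the element-counted strides are illustrative only and are not taken from the library.

// Scalar sketch of the 4x16 FP32 block computed by the micro-kernel above.
// Assumptions (not from the library): the vC accumulators start at zero,
// K counts columns of A / rows of B, and all strides are in float elements.
static void sgemm_block_4x16_reference(
    const float *a, const float *b, float *c, const int K,
    const int a_row_stride, const int b_row_stride, const int c_row_stride)
{
  for (int i = 0; i < 4; i++)          // Rows of A and C (vA1..vA4).
  {
    for (int j = 0; j < 16; j++)       // Columns of B and C (vB1..vB4, 4 lanes each).
    {
      float acc = 0.0f;
      for (int k = 0; k < K; k++)
      {
        // Matches "fmla vCij.4s, vBj.4s, vAi.s[k % 4]": one element of A
        // broadcast against four contiguous elements of B.
        acc += a[i * a_row_stride + k] * b[k * b_row_stride + j];
      }
      c[i * c_row_stride + j] = acc;
    }
  }
}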
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/NEON/kernels/winograd/perf.h
similarity index 67%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/NEON/kernels/winograd/perf.h
index 37857b6..11fb0c4 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/NEON/kernels/winograd/perf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#pragma once
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
+/* Prototypes from perf.c */
 
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
+void start_counter(int fd);
+long long get_counter(int fd);
+long long stop_counter(int fd);
+int open_instruction_counter(void);
+int open_cycle_counter(void);
diff --git a/src/core/NEON/kernels/winograd/profiler.hpp b/src/core/NEON/kernels/winograd/profiler.hpp
new file mode 100644
index 0000000..143192b
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/profiler.hpp
@@ -0,0 +1,244 @@
+
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include <map>
+#include <vector>
+
+#include "perf.h"
+#include <unistd.h>
+
+class profiler {
+private:
+#ifdef CYCLE_PROFILING
+    struct ProfileEntry {
+      int event_id;
+      long int bytes_read, ops, bytes_written;
+      long int duration;
+    };
+
+    static const int maxevents = 10000;
+    ProfileEntry events[maxevents];
+    int currentevent;
+    int countfd;
+
+    std::map<const char *, int> event_ids;
+
+    int get_event_id(const char *id) {
+      if (!event_ids.count(id)) {
+        event_ids.emplace(id, event_ids.size());
+      }
+      return event_ids[id];
+    }
+#endif  // CYCLE_PROFILING
+
+public:
+#ifdef CYCLE_PROFILING
+    profiler() {
+        currentevent = 0;
+        countfd = open_cycle_counter();
+    }
+
+    ~profiler() {
+        close(countfd);
+
+        // Compute performance from recorded events
+        struct ProfileResult {
+          ProfileResult() : total_calls(0),
+                            total_duration(0),
+                            total_bytes_read(0),
+                            total_ops(0),
+                            total_bytes_written(0) {
+          }
+
+          void operator+=(const ProfileEntry &rhs) {
+            total_calls++;
+            total_duration += rhs.duration;
+            total_bytes_read += rhs.bytes_read;
+            total_ops += rhs.ops;
+            total_bytes_written += rhs.bytes_written;
+          }
+
+          float avg_duration(void) const {
+            return static_cast<float>(total_duration) /
+                   static_cast<float>(total_calls);
+          }
+
+          float bytes_read_per_cycle(void) const {
+            return static_cast<float>(total_bytes_read) /
+                   static_cast<float>(total_duration);
+          }
+
+          float ops_per_cycle(void) const {
+            return static_cast<float>(total_ops) /
+                   static_cast<float>(total_duration);
+          }
+
+          float bytes_written_per_cycle(void) const {
+            return static_cast<float>(total_bytes_written) /
+                   static_cast<float>(total_duration);
+          }
+
+          long int total_calls,
+                   total_duration,
+                   total_bytes_read,
+                   total_ops,
+                   total_bytes_written;
+        };
+
+        std::vector<ProfileResult> totals;
+        totals.resize(event_ids.size());
+        for (int i = 0; i < currentevent; i++) {
+          const auto &event = events[i];
+          totals[event.event_id] += event;
+        }
+
+        // Get the longest label
+        int len_label = 0;
+        for (const auto &kv : event_ids) {
+          len_label = std::max(len_label, static_cast<int>(strlen(kv.first)));
+        }
+
+        // Get the longest values for every other field
+        const auto get_length_of_field =
+          [totals] (const char *title, auto f, auto len) -> size_t {
+            size_t l = strlen(title);
+            for (const auto &v : totals) {
+              l = std::max(l, len(f(v)));
+            }
+            return l;
+        };
+
+        // Get the strlen for an int
+        const auto intlen = [] (long int x) -> size_t {
+          size_t len = 0;
+          do {
+            x /= 10;
+            len++;
+          } while (x);
+          return len;
+        };
+
+        // Get the strlen for a float
+        const auto floatlen = [] (const int precision) {
+          return [precision] (float x) {
+            size_t len = 0;
+
+            if (!std::isfinite(x)) {
+              return static_cast<size_t>(3);
+            }
+
+            do {
+              x /= 10.0f;
+              len++;
+            } while (x > 1.0f);
+            return len + 1 + precision;
+          };
+        };
+
+        const int len_calls = get_length_of_field(
+            "Calls", [] (const auto &v) {return v.total_calls;},
+            intlen
+        );
+        const int len_duration = get_length_of_field(
+            "Duration", [] (const auto &v) {return v.total_duration;},
+            intlen
+        );
+        const int len_average_duration = get_length_of_field(
+            "Average", [] (const auto &v) {return v.avg_duration();},
+            floatlen(2)
+        );
+        const int len_reads_per_cycle = get_length_of_field(
+            "Reads / cycle",
+            [] (const auto &v) {return v.bytes_read_per_cycle();},
+            floatlen(6)
+        );
+        const int len_ops_per_cycle = get_length_of_field(
+            "Ops / cycle",
+            [] (const auto &v) {return v.ops_per_cycle();},
+            floatlen(6)
+        );
+        const int len_writes_per_cycle = get_length_of_field(
+            "Writes / cycle",
+            [] (const auto &v) {return v.bytes_written_per_cycle();},
+            floatlen(6)
+        );
+
+        // Print header
+        printf(
+          "%*s    %*s    %*s    %*s    %*s    %*s    %*s\n",
+          len_label, "",
+          len_calls, "Calls",
+          len_duration, "Duration",
+          len_average_duration, "Average",
+          len_reads_per_cycle, "Reads / cycle",
+          len_ops_per_cycle, "Ops / cycle",
+          len_writes_per_cycle, "Writes / cycle"
+        );
+        for (const auto &kv : event_ids) {
+          const auto id = kv.second;
+          printf(
+            "%*s    %*ld    %*ld    %*.2f    %*.6f    %*.6f    %*.6f\n",
+            len_label, kv.first,
+            len_calls, totals[id].total_calls,
+            len_duration, totals[id].total_duration,
+            len_average_duration, totals[id].avg_duration(),
+            len_reads_per_cycle, totals[id].bytes_read_per_cycle(),
+            len_ops_per_cycle, totals[id].ops_per_cycle(),
+            len_writes_per_cycle, totals[id].bytes_written_per_cycle()
+          );
+        }
+        printf("\n");
+    }
+#endif  // CYCLE_PROFILING
+
+    template <typename T>
+    void operator() (const char * event,
+                     T func,
+                     long int bytes_read = 0,
+                     long int ops = 0,
+                     long int bytes_written = 0) {
+#ifdef CYCLE_PROFILING
+        if (currentevent==maxevents) {
+            func();
+        } else {
+            start_counter(countfd);
+            func();
+            long long cycs = stop_counter(countfd);
+
+            // Store the profiling data
+            events[currentevent++] = {
+              get_event_id(event), bytes_read, ops, bytes_written, cycs
+            };
+        }
+#else
+      func();
+#endif  // CYCLE_PROFILING
+    }
+};
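
The profiler above times a callable with a hardware cycle counter when CYCLE_PROFILING is defined, aggregates per-event statistics, and prints the summary table when the profiler object is destroyed; without the macro it simply invokes the callable. A minimal usage sketch follows; the event name, the toy kernel and the byte/op counts are illustrative and not taken from the library.

// Minimal usage sketch for the profiler above. The kernel and the
// byte/op counts are illustrative only; with CYCLE_PROFILING undefined
// the call collapses to running the functor directly.
#include "profiler.hpp"

void run_with_profiling(const float *a, const float *b, float *c, const int n)
{
  profiler prof;
  prof(
    "vector_add",                 // Event label used in the printed report.
    [&] () {
      for (int i = 0; i < n; i++) { c[i] = a[i] + b[i]; }
    },
    2 * n * sizeof(float),        // Bytes read.
    n,                            // Operations performed.
    n * sizeof(float)             // Bytes written.
  );
}  // When CYCLE_PROFILING is defined, the per-event table prints here.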
diff --git a/src/core/NEON/kernels/winograd/shims.hpp b/src/core/NEON/kernels/winograd/shims.hpp
new file mode 100644
index 0000000..249e575
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/shims.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+/** Re-order a weight tensor from [Output feature map x Input feature map x
+ *  Height x Width] format to [Height x Width x Input feature map x Output
+ *  feature map] format.
+ */
+template <typename T>
+inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
+  const T* const in,  // Input in [Output x Input x Height x Width] form
+  T* const out,       // Output in [Height x Width x Input x Output] form
+  const int n_output_feature_maps,
+  const int n_input_feature_maps,
+  const int n_rows,
+  const int n_cols,
+  int in_output_feature_map_stride=0,
+  int in_input_feature_map_stride=0,
+  int in_row_stride=0,
+  int out_row_stride=0,
+  int out_col_stride=0,
+  int out_input_feature_map_stride=0
+);
+
+/** Re-order a weight tensor from [Height x Width x Input feature map x Output
+ *  feature map] format to [Output feature map x Input feature map x Height x
+ *  Width] format.
+ */
+template <typename T>
+inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
+  const T* const in,  // Input in [Height x Width x Input x Output] form
+  T* const out,       // Output in [Output x Input x Height x Width] form
+  const int n_rows,
+  const int n_cols,
+  const int n_input_feature_maps,
+  const int n_output_feature_maps,
+  int in_row_stride=0,
+  int in_col_stride=0,
+  int in_input_feature_map_stride=0,
+  int out_output_feature_map_stride=0,
+  int out_input_feature_map_stride=0,
+  int out_row_stride=0
+);
+
+
+/** Re-order a tensor from NCHW format to NHWC format.
+ */
+template <typename T>
+inline void nchw_to_nhwc(
+  const T* const in,
+  T* const out,
+  const int n_batches,
+  const int n_channels,
+  const int n_rows,
+  const int n_cols,
+  int in_batch_stride=0,
+  int in_channel_stride=0,
+  int in_row_stride=0,
+  int out_batch_stride=0,
+  int out_row_stride=0,
+  int out_col_stride=0
+)
+{
+  // Fill in the stride values
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
+  in_channel_stride = (in_channel_stride) ? in_channel_stride
+                                          : n_rows * in_row_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_channels * in_channel_stride;
+
+  out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_rows * out_row_stride;
+
+  // Perform the re-ordering
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_row = in_batch + i*in_row_stride;
+      T* const out_row = out_batch + i*out_row_stride;
+
+      for (int j = 0; j < n_cols; j++)
+      {
+        const T* const in_col = in_row + j;
+        T* const out_col = out_row + j*out_col_stride;
+
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_col + c*in_channel_stride;
+          out_col[c] = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+/** Re-order a tensor from NHWC format to NCHW format.
+ */
+template <typename T>
+inline void nhwc_to_nchw(
+  const T* const in,  // Input data in NHWC form
+  T* const out,       // Output data in NCHW form
+  const int n_batches,
+  const int n_rows,
+  const int n_cols,
+  const int n_channels,
+  int in_batch_stride=0,
+  int in_row_stride=0,
+  int in_col_stride=0,
+  int out_batch_stride=0,
+  int out_channel_stride=0,
+  int out_row_stride=0
+)
+{
+  // Fill in stride values
+  in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
+  in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
+  in_batch_stride = (in_batch_stride) ? in_batch_stride
+                                      : n_rows * in_row_stride;
+
+  out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
+  out_channel_stride = (out_channel_stride) ? out_channel_stride
+                                            : n_rows * out_row_stride;
+  out_batch_stride = (out_batch_stride) ? out_batch_stride
+                                        : n_channels * out_channel_stride;
+
+  // Perform the re-ordering
+  // For every batch
+  for (int n = 0; n < n_batches; n++)
+  {
+    const T* const in_batch = in + n*in_batch_stride;
+    T* const out_batch = out + n*out_batch_stride;
+
+    // For every row
+    for (int i = 0; i < n_rows; i++)
+    {
+      const T* const in_i = in_batch + i*in_row_stride;
+      T* const out_i = out_batch + i*out_row_stride;
+
+      // For every column
+      for (int j = 0; j < n_cols; j++)
+      {
+        const T* const in_j = in_i + j*in_col_stride;
+        T* const out_j = out_i + j;
+
+        // For every channel
+        for (int c = 0; c < n_channels; c++)
+        {
+          const T* const in_channel = in_j + c;
+          T* const out_channel = out_j + c*out_channel_stride;
+          *(out_channel) = *(in_channel);
+        }
+      }
+    }
+  }
+}
+
+
+/*****************************************************************************/
+/* Generic weight re-order implementation.
+ */
+template <typename T>
+inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
+  const T* const in,  // Input in [Output x Input x Height x Width] form
+  T* const out,       // Output in [Height x Width x Input x Output] form
+  const int n_output_feature_maps,
+  const int n_input_feature_maps,
+  const int n_rows,
+  const int n_cols,
+  int in_output_feature_map_stride,
+  int in_input_feature_map_stride,
+  int in_row_stride,
+  int out_row_stride,
+  int out_col_stride,
+  int out_input_feature_map_stride
+)
+{
+  // Fill in stride values
+  in_row_stride = (in_row_stride)
+    ? in_row_stride
+    : n_cols;
+  in_input_feature_map_stride = (in_input_feature_map_stride)
+    ? in_input_feature_map_stride
+    : n_rows * in_row_stride;
+  in_output_feature_map_stride = (in_output_feature_map_stride)
+    ? in_output_feature_map_stride
+    : n_input_feature_maps * in_input_feature_map_stride;
+
+  out_input_feature_map_stride = (out_input_feature_map_stride)
+    ? out_input_feature_map_stride
+    : n_output_feature_maps;
+  out_col_stride = (out_col_stride)
+    ? out_col_stride
+    : n_input_feature_maps * out_input_feature_map_stride;
+  out_row_stride = (out_row_stride)
+    ? out_row_stride
+    : n_cols * out_col_stride;
+
+  // Perform the re-ordering
+  for (int i = 0; i < n_rows; i++)
+  {
+    const T* const in_row = in + i * in_row_stride;
+    T* out_row = out + i * out_row_stride;
+
+    for (int j = 0; j < n_cols; j++)
+    {
+      const T* const in_col = in_row + j;
+      T* const out_col = out_row + j * out_col_stride;
+
+      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
+      {
+        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
+        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
+
+        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
+        {
+          const T* const in_ofm = in_ifm + ofm * in_output_feature_map_stride;
+          T* const out_ofm = out_ifm + ofm;
+          *(out_ofm) = *(in_ofm);
+        }
+      }
+    }
+  }
+}
+
+/*****************************************************************************/
+/* Generic weight re-order implementation.
+ */
+template <typename T>
+inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
+  const T* const in,  // Input in [Height x Width x Input x Output] form
+  T* const out,       // Output in [Output x Input x Height x Width] form
+  const int n_rows,
+  const int n_cols,
+  const int n_input_feature_maps,
+  const int n_output_feature_maps,
+  int in_row_stride,
+  int in_col_stride,
+  int in_input_feature_map_stride,
+  int out_output_feature_map_stride,
+  int out_input_feature_map_stride,
+  int out_row_stride
+)
+{
+  // Fill in the stride values
+  in_input_feature_map_stride = (in_input_feature_map_stride)
+    ? in_input_feature_map_stride
+    : n_output_feature_maps;
+  in_col_stride = (in_col_stride)
+    ? in_col_stride
+    : n_input_feature_maps * in_input_feature_map_stride;
+  in_row_stride = (in_row_stride)
+    ? in_row_stride
+    : n_cols * in_col_stride;
+
+  out_row_stride = (out_row_stride)
+    ? out_row_stride
+    : n_cols;
+  out_input_feature_map_stride = (out_input_feature_map_stride)
+    ? out_input_feature_map_stride
+    : n_rows * out_row_stride;
+  out_output_feature_map_stride = (out_output_feature_map_stride)
+    ? out_output_feature_map_stride
+    : n_input_feature_maps * out_input_feature_map_stride;
+
+  // Perform the re-ordering
+  for (int i = 0; i < n_rows; i++)
+  {
+    const T* const in_row = in + i * in_row_stride;
+    T* const out_row = out + i * out_row_stride;
+
+    for (int j = 0; j < n_cols; j++)
+    {
+      const T* const in_col = in_row + j * in_col_stride;
+      T* const out_col = out_row + j;
+
+      for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
+      {
+        const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
+        T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
+
+        for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
+        {
+          const T* const in_ofm = in_ifm + ofm;
+          T* const out_ofm = out_ifm + ofm * out_output_feature_map_stride;
+          *(out_ofm) = *(in_ofm);
+        }
+      }
+    }
+  }
+}
+
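
The shims above convert between packed tensor layouts, with zero-valued stride arguments standing in for the default contiguous strides. The following is a small, self-contained round-trip sketch; the shapes are arbitrary and chosen only for illustration.

// Round-trip sketch for the layout shims above. Zero strides request the
// default packed layout; the shapes here are purely illustrative.
#include <cstddef>
#include <vector>
#include "shims.hpp"

void reorder_example()
{
  const int n_batches = 1, n_channels = 2, n_rows = 3, n_cols = 4;
  std::vector<float> nchw(n_batches * n_channels * n_rows * n_cols);
  for (std::size_t i = 0; i < nchw.size(); i++) { nchw[i] = static_cast<float>(i); }

  std::vector<float> nhwc(nchw.size()), back(nchw.size());
  nchw_to_nhwc(nchw.data(), nhwc.data(), n_batches, n_channels, n_rows, n_cols);
  nhwc_to_nchw(nhwc.data(), back.data(), n_batches, n_rows, n_cols, n_channels);
  // `back` now matches `nchw` element for element.
}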
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/NEON/kernels/winograd/transforms.hpp
similarity index 67%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/NEON/kernels/winograd/transforms.hpp
index 37857b6..8546ee9 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/NEON/kernels/winograd/transforms.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
+#pragma once
 
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
+#include "transforms/input_2x2_3x3.hpp"
+#include "transforms/kernel_2x2_3x3.hpp"
+#include "transforms/output_2x2_3x3.hpp"
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp
new file mode 100644
index 0000000..ca8d012
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp
@@ -0,0 +1,639 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include "arm_compute/core/NEON/kernels/winograd/tensor.hpp"
+
+
+namespace winograd {
+  /* Transform an input tensor into the Winograd domain.
+   */
+  template <typename T>
+  struct Winograd2x2_3x3GemmInput {
+    static void execute(
+        const T *inptr,
+        const Tensor4DShape& input_shape,
+        const PaddingType padding_type,
+        const int tile_M,
+        const int tile_N,
+        T *outptr_base,
+        const int matrix_stride,
+        const int matrix_batch_stride,
+        const int matrix_row_stride
+    );
+
+    static size_t bytes_read(const Tensor4DShape &input_shape,
+                           const Tensor4DShape &output_shape) {
+      const int tile_rows = iceildiv(output_shape.n_rows, 2);
+      const int tile_cols = iceildiv(output_shape.n_cols, 2);
+      return input_shape.n_batches * tile_rows * (16 + 8*(tile_cols - 1)) * input_shape.n_channels * sizeof(T);
+    }
+
+    static int flops_performed(const Tensor4DShape &input_shape,
+                                const Tensor4DShape &output_shape) {
+      const int tile_rows = iceildiv(output_shape.n_rows, 2);
+      const int tile_cols = iceildiv(output_shape.n_cols, 2);
+      return input_shape.n_batches * tile_rows * (32 + 24*(tile_cols - 1)) * input_shape.n_channels;
+    }
+
+    static size_t bytes_written(const Tensor4DShape &input_shape,
+                              const Tensor4DShape &output_shape) {
+      const int tile_rows = iceildiv(output_shape.n_rows, 2);
+      const int tile_cols = iceildiv(output_shape.n_cols, 2);
+      const int M = input_shape.n_batches * tile_rows * tile_cols;
+      return 16 * M * input_shape.n_channels * sizeof(T);
+    }
+
+    protected:
+    template <const PaddingType padding, const int pad_bottom, const int pad_right>
+    static void process_tile_tensor(
+        const int tile_M,      // Number of rows of tiles
+        const int tile_N,      // Number of columns of tiles
+        int n_channels,  // Number of input channels
+        const T* const input,  // Base input pointer (appropriate to batch and channel)
+        const int input_row_stride,  // Stride between rows of the input
+        const int input_col_stride,  // Stride between columns of the input
+        T* const matrix,              // 1st output matrix (appropriate to batch and channel)
+        const int matrix_stride,      // Stride between matrices
+        const int matrix_row_stride   // Stride between rows of the output matrix
+    );
+
+    template <const int pad_top, const int pad_left,
+              const int pad_bottom, const int pad_right,
+              const int proc_channels>
+    static void process_tile_row(
+        const int tile_N,      // Number of tiles in the row
+        const T* const input,  // Base input pointer (appropriate to batch, channel and row)
+        const int input_row_stride,  // Stride between rows of the input
+        const int input_col_stride,  // Stride between columns of the input
+        T* const matrix,              // 1st output matrix (appropriate to batch, channel and row)
+        const int matrix_stride,      // Stride between matrices
+        const int matrix_row_stride   // Stride between rows of the output matrix
+    );
+  };
+
+  template <typename T>
+  struct Winograd2x2_3x3GemmInputChannelwise {
+    static void execute(
+        const T *inptr,
+        const Tensor4DShape& input_shape,
+        const PaddingType padding_type,
+        const int tile_M,
+        const int tile_N,
+        T *outptr_base,
+        const int matrix_stride,
+        const int matrix_batch_stride,
+        const int matrix_row_stride
+    );
+
+    static size_t bytes_read(const Tensor4DShape &input_shape,
+                           const Tensor4DShape &output_shape) {
+      // We read as many bytes as we write
+      return bytes_written(input_shape, output_shape);
+    }
+
+    static int flops_performed(const Tensor4DShape &input_shape,
+                                const Tensor4DShape &output_shape) {
+      const int tile_rows = iceildiv(output_shape.n_rows, 2);
+      const int tile_cols = iceildiv(output_shape.n_cols, 2);
+      return input_shape.n_batches * tile_rows * 32 * tile_cols * input_shape.n_channels;
+    }
+
+    static size_t bytes_written(const Tensor4DShape &input_shape,
+                              const Tensor4DShape &output_shape) {
+      return winograd::Winograd2x2_3x3GemmInput<T>::bytes_written(input_shape, output_shape);
+    }
+
+    protected:
+    typedef void (*tilefunc)(int, const T*, int, int, T*, int);
+    template <const int pad_top,
+              const int pad_left,
+              const int pad_bottom,
+              const int pad_right>
+    static void process_tile(
+        int n_channels,  // Number of channels in the tile
+        const T* const input_base,
+        const int input_row_stride,
+        const int input_col_stride,
+        T* const matrix_base,
+        const int matrix_stride
+    );
+
+    private:
+    template <const int pad_top,
+              const int pad_left,
+              const int pad_bottom,
+              const int pad_right,
+              const int proc_channels>
+    static void _process_tile(
+        int &n_channels, const T* &inptr,
+        const int input_row_stride, const int input_col_stride,
+        T* &outptr, const int matrix_stride
+    );
+  };
+}
+
+/*****************************************************************************/
+// Include specialised implementations here
+#include "input_2x2_3x3/a64_float.hpp"
+#include "input_2x2_3x3/a64_float_channelwise.hpp"
+/*****************************************************************************/
+
+/*****************************************************************************/
+template <typename T>
+void winograd::Winograd2x2_3x3GemmInput<T>::execute(
+    const T *inptr_base,
+    const Tensor4DShape& input_shape,
+    const PaddingType padding_type,
+    const int tile_M,
+    const int tile_N,
+    T *outptr_base,
+    const int matrix_stride,
+    const int matrix_batch_stride,
+    const int matrix_row_stride
+) {
+  // Select an appropriate matrix processing method for the shape and padding
+  // of the input tensor.
+  typedef void (*tensorfunc)(int, int, int, const T*, int, int, T*, int, int);
+  const auto process_tensor = [&padding_type, &input_shape] () -> tensorfunc {
+    if (padding_type == PADDING_VALID) {
+      const int pad_bottom = input_shape.n_rows % 2;
+      const int pad_right = input_shape.n_cols % 2;
+
+      if (pad_bottom == 0 && pad_right == 0) {
+        return process_tile_tensor<PADDING_VALID, 0, 0>;
+      } else if (pad_bottom == 0 && pad_right == 1) {
+        return process_tile_tensor<PADDING_VALID, 0, 1>;
+      } else if (pad_bottom == 1 && pad_right == 0) {
+        return process_tile_tensor<PADDING_VALID, 1, 0>;
+      } else if (pad_bottom == 1 && pad_right == 1) {
+        return process_tile_tensor<PADDING_VALID, 1, 1>;
+      }
+    } else {  // PADDING_SAME
+      const int pad_bottom = 1 + input_shape.n_rows % 2;
+      const int pad_right = 1 + input_shape.n_cols % 2;
+
+      if (pad_bottom == 1 && pad_right == 1) {
+        return process_tile_tensor<PADDING_SAME, 1, 1>;
+      } else if (pad_bottom == 1 && pad_right == 2) {
+        return process_tile_tensor<PADDING_SAME, 1, 2>;
+      } else if (pad_bottom == 2 && pad_right == 1) {
+        return process_tile_tensor<PADDING_SAME, 2, 1>;
+      } else if (pad_bottom == 2 && pad_right == 2) {
+        return process_tile_tensor<PADDING_SAME, 2, 2>;
+      }
+    }
+
+    printf("%s::%u Uncovered case.\n", __FILE__, __LINE__);
+    exit(-1);
+    return NULL;  // No function found
+  } ();
+
+  // Compute strides
+  const int input_row_stride = input_shape.n_cols * input_shape.n_channels;
+  const int input_col_stride = input_shape.n_channels;
+
+  // Process each batch of the tensor in turn.
+  for (int batch = 0; batch < input_shape.n_batches; batch++) {
+    // Work out pointers
+    const T *inptr = inptr_base + (batch * input_shape.n_rows *
+                                   input_shape.n_cols * input_shape.n_channels);
+    T *outptr = outptr_base + batch * matrix_batch_stride;
+
+    // Delegate doing the actual work
+    process_tensor(
+      tile_M, tile_N, input_shape.n_channels,
+      inptr, input_row_stride, input_col_stride,
+      outptr, matrix_stride, matrix_row_stride
+    );
+  }
+}
+
+/*****************************************************************************/
+template <typename T>
+template <const PaddingType padding, const int pad_bottom, const int pad_right>
+void winograd::Winograd2x2_3x3GemmInput<T>::process_tile_tensor(
+    const int tile_M,      // Number of rows of tiles
+    const int tile_N,      // Number of columns of tiles
+    int n_channels,  // Number of input channels
+    const T* const input,  // Base input pointer (appropriate to batch and channel)
+    const int input_row_stride,  // Stride between rows of the input
+    const int input_col_stride,  // Stride between columns of the input
+    T* const matrix,              // 1st output matrix (appropriate to batch and channel)
+    const int matrix_stride,      // Stride between matrices
+    const int matrix_row_stride   // Stride between rows of the output matrix
+) {
+  // Base row processing functions
+  typedef void (*rowfunc)(int, const T*, int, int, T*, int, int);
+  const rowfunc process_top_row[3] = {
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 1>
+      : process_tile_row<1, 1, 0, pad_right, 1>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 2>
+      : process_tile_row<1, 1, 0, pad_right, 2>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 4>
+      : process_tile_row<1, 1, 0, pad_right, 4>,
+  };
+  const rowfunc process_middle_row[3] = {
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 1>
+      : process_tile_row<0, 1, 0, pad_right, 1>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 2>
+      : process_tile_row<0, 1, 0, pad_right, 2>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, 0, pad_right, 4>
+      : process_tile_row<0, 1, 0, pad_right, 4>,
+  };
+  const rowfunc process_bottom_row[3] = {
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, pad_bottom, pad_right, 1>
+      : process_tile_row<0, 1, pad_bottom, pad_right, 1>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, pad_bottom, pad_right, 2>
+      : process_tile_row<0, 1, pad_bottom, pad_right, 2>,
+    (padding == PADDING_VALID)
+      ? process_tile_row<0, 0, pad_bottom, pad_right, 4>
+      : process_tile_row<0, 1, pad_bottom, pad_right, 4>,
+  };
+
+  // Method to get an input pointer for the given tile row
+  const auto get_inptr = [&input, &input_row_stride] (const int tile_i) {
+    if (padding == PADDING_VALID) {
+      return input + 2 * tile_i * input_row_stride;
+    } else {
+      return input + (2 * tile_i - (tile_i ? 1 : 0)) * input_row_stride;
+    }
+  };
+
+  // Wrapper to process a row of tiles, covering all channels.
+  const auto process_row =
+    [tile_N, input_row_stride, input_col_stride, matrix_stride, matrix_row_stride, n_channels]
+    (const rowfunc f[3], const T *inptr, T *outptr) {
+      int rem_channels = n_channels;
+
+      // While there remain channels to process continue to process the
+      // row.
+      for (; rem_channels >= 4; rem_channels -= 4, inptr += 4, outptr += 4) {
+        f[2](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
+      }
+      for (; rem_channels >= 2; rem_channels -= 2, inptr += 2, outptr += 2) {
+        f[1](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
+      }
+      if (rem_channels) {
+        f[0](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
+      }
+  };
+
+  // Process all rows of tiles in the tensor
+  for (int tile_i = 0; tile_i < tile_M; tile_i++) {
+    T* const m_row = matrix + tile_i * tile_N * matrix_row_stride;
+    const T *row_inptr = get_inptr(tile_i);
+
+    if (tile_i == 0) {
+      // Top row of the input
+      process_row(process_top_row, row_inptr, m_row);
+    } else if (tile_i == tile_M - 1) {
+      // Bottom row of the input
+      process_row(process_bottom_row, row_inptr, m_row);
+    } else {
+      // Any other row of the input
+      process_row(process_middle_row, row_inptr, m_row);
+    }
+  }
+}
+
+/*****************************************************************************/
+template <typename T>
+template <const int pad_top, const int pad_left,
+          const int pad_bottom, const int pad_right,
+          const int proc_channels>
+void winograd::Winograd2x2_3x3GemmInput<T>::process_tile_row(
+    const int tile_N,      // Number of tiles in the row
+    const T* const input,  // Base input pointer (appropriate to batch, channel and row)
+    const int input_row_stride,  // Stride between rows of the input
+    const int input_col_stride,  // Stride between columns of the input
+    T* const matrix,              // 1st output matrix (appropriate to batch, channel and row)
+    const int matrix_stride,      // Stride between matrices
+    const int matrix_row_stride   // Stride between rows of the output matrix
+) {
+  // Construct copies of the pointers
+  const T *inptr = input;
+  T *outptr = matrix;
+
+  // Storage for the tensors x, X.T x, and X.T x X.
+  T x[4][4][proc_channels], XTx[4][4][proc_channels], XTxX[4][4][proc_channels];
+
+  // For every tile in the row
+  for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+    // Determine the padding for the tile
+    const int tile_pad_left = (tile_j == 0) ? pad_left : 0;
+    const int tile_pad_right = (tile_j == tile_N - 1) ? pad_right : 0;
+
+    // Load tile values. If this is the first tile in the row then we must load
+    // all values, otherwise we can just load the final two columns of the input.
+    for (int i = 0; i < 4; i++) {
+      for (int j = ((tile_j == 0) ? 0 : 2); j < 4; j++) {
+        // Fill with padding if required
+        if (i < pad_top || 4 - pad_bottom <= i ||
+            j < tile_pad_left || 4 - tile_pad_right <= j) {
+          for (int c = 0; c < proc_channels; c++) {
+            x[i][j][c] = static_cast<T>(0);  // Padding
+          }
+        } else {
+          // Load values, note that the initial padding offsets the pointer we
+          // were provided.
+          for (int c = 0; c < proc_channels; c++) {
+            const int row_offset = (i - pad_top) * input_row_stride;
+            const int col_offset = (j - tile_pad_left) * input_col_stride;
+            x[i][j][c] = inptr[row_offset + col_offset + c];
+          }
+        }
+      }
+    }
+
+    // Compute the matrix X.T x.  Note, can elide operations depending on the
+    // padding. Furthermore, if this isn't the left-most tile we can skip half
+    // of the operations by copying results from the previous version of X.T x.
+    // This latter optimisation can be simplified by unrolling the outermost
+    // loop by two and by renaming the registers containing XTx.
+    if (tile_j == 0) {
+      for (int j = 0; j < 4; j++) {
+        for (int c = 0; c < proc_channels; c++) {
+          XTx[0][j][c] =  x[0][j][c] - x[2][j][c];
+          XTx[1][j][c] =  x[1][j][c] + x[2][j][c];
+          XTx[2][j][c] = -x[1][j][c] + x[2][j][c];
+          XTx[3][j][c] =  x[1][j][c] - x[3][j][c];
+        }
+      }
+    } else {
+      for (int j = 0; j < 2; j++) {
+        for (int c = 0; c < proc_channels; c++) {
+          XTx[0][j][c] = XTx[0][j + 2][c];
+          XTx[1][j][c] = XTx[1][j + 2][c];
+          XTx[2][j][c] = XTx[2][j + 2][c];
+          XTx[3][j][c] = XTx[3][j + 2][c];
+        }
+      }
+      for (int j = 2; j < 4; j++) {
+        for (int c = 0; c < proc_channels; c++) {
+          XTx[0][j][c] =  x[0][j][c] - x[2][j][c];
+          XTx[1][j][c] =  x[1][j][c] + x[2][j][c];
+          XTx[2][j][c] = -x[1][j][c] + x[2][j][c];
+          XTx[3][j][c] =  x[1][j][c] - x[3][j][c];
+        }
+      }
+    }
+
+    // Compute the matrix X.T x X. Note, can elide operations based on the
+    // padding.
+    for (int i = 0; i < 4; i++) {
+      for (int c = 0; c < proc_channels; c++) {
+        XTxX[i][0][c] =  XTx[i][0][c] - XTx[i][2][c];
+        XTxX[i][1][c] =  XTx[i][1][c] + XTx[i][2][c];
+        XTxX[i][2][c] = -XTx[i][1][c] + XTx[i][2][c];
+        XTxX[i][3][c] =  XTx[i][1][c] - XTx[i][3][c];
+      }
+    }
+
+    // Store the output matrix (X.T x X)
+    for (int i = 0; i < 4; i++) {
+      for (int j = 0; j < 4; j++) {
+        // Get a pointer to the relevant output matrix
+        T *mptr = outptr + (i*4 + j)*matrix_stride;
+
+        // Write out the channels
+        for (int c = 0; c < proc_channels; c++) {
+          mptr[c] = XTxX[i][j][c];
+        }
+      }
+    }
+
+    // Update the pointers
+    inptr += input_col_stride * ((tile_j == 0 && pad_left) ? 1 : 2);
+    outptr += matrix_row_stride;
+  }
+}
+
+/*****************************************************************************/
+template <typename T>
+void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::execute(
+    const T *inptr,
+    const Tensor4DShape& input_shape,
+    const PaddingType padding_type,
+    const int tile_M,
+    const int tile_N,
+    T *outptr_base,
+    const int matrix_stride,
+    const int matrix_batch_stride,
+    const int matrix_row_stride
+) {
+  const int n_channels = input_shape.n_channels;
+  const int input_col_stride = n_channels;
+  const int input_row_stride = input_shape.n_cols * input_col_stride;
+
+  // Determine the padding and hence select appropriate methods for each tile.
+  tilefunc fs[3][3];
+
+  if (padding_type == PADDING_VALID) {
+    constexpr int pad_top = 0;
+    constexpr int pad_left = 0;
+    const int pad_right = input_shape.n_cols % 2 == 0;
+
+    fs[0][0] = process_tile<pad_top, pad_left, 0, 0>;
+    fs[0][1] = process_tile<pad_top, 0, 0, 0>;
+    fs[0][2] = (pad_right) ? process_tile<pad_top, 0, 0, 0> : process_tile<pad_top, 0, 0, 1>;
+
+    fs[1][0] = process_tile<0, pad_left, 0, 0>;
+    fs[1][1] = process_tile<0, 0, 0, 0>;
+    fs[1][2] = (pad_right) ? process_tile<0, 0, 0, 0> : process_tile<0, 0, 0, 1>;
+
+    if (input_shape.n_rows % 2 == 0) {
+      constexpr int pad_bottom = 0;
+      fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
+      fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
+      fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 0> : process_tile<0, 0, pad_bottom, 1>;
+    } else {
+      constexpr int pad_bottom = 1;
+      fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
+      fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
+      fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 0> : process_tile<0, 0, pad_bottom, 1>;
+    }
+  } else {
+    constexpr int pad_top = 1;
+    constexpr int pad_left = 1;
+    const int pad_right = input_shape.n_cols % 2 == 0;
+
+    fs[0][0] = process_tile<pad_top, pad_left, 0, 0>;
+    fs[0][1] = process_tile<pad_top, 0, 0, 0>;
+    fs[0][2] = (pad_right) ? process_tile<pad_top, 0, 0, 1> : process_tile<pad_top, 0, 0, 2>;
+
+    fs[1][0] = process_tile<0, pad_left, 0, 0>;
+    fs[1][1] = process_tile<0, 0, 0, 0>;
+    fs[1][2] = (pad_right) ? process_tile<0, 0, 0, 1> : process_tile<0, 0, 0, 2>;
+
+    if (input_shape.n_rows % 2 == 0) {
+      constexpr int pad_bottom = 1;
+      fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
+      fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
+      fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 1> : process_tile<0, 0, pad_bottom, 2>;
+    } else {
+      constexpr int pad_bottom = 2;
+      fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
+      fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
+      fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 1> : process_tile<0, 0, pad_bottom, 2>;
+    }
+  }
+
+  // Process each tile in turn
+  for (int batch = 0; batch < input_shape.n_batches; batch++) {
+    const T* const input_base_batch = inptr + batch*input_shape.n_rows*input_shape.n_cols*n_channels;
+
+    for (int tile_i = 0; tile_i < tile_M; tile_i++) {
+      const int row_offset = (tile_i == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1);
+      const T* const input_base_row = input_base_batch + (2*tile_i - row_offset)*input_shape.n_cols*n_channels;
+
+      // Select the set of functions for the row
+      const int fs_i = (tile_i == 0) ? 0 : ((tile_i < tile_M - 1) ? 1 : 2);
+
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        // Select the function for the column
+        const int fs_j = (tile_j == 0) ? 0 : ((tile_j < tile_N - 1) ? 1 : 2);
+        const auto f = fs[fs_i][fs_j];
+
+        // Get pointers into the input and outputs
+        const int col_offset = (tile_j == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1);
+        const T* const input_base_col = input_base_row + (2*tile_j - col_offset)*n_channels;
+        T* const matrix_base = outptr_base + batch*matrix_batch_stride + (tile_i*tile_N + tile_j)*matrix_row_stride;
+        f(n_channels, input_base_col, input_row_stride, input_col_stride,
+          matrix_base, matrix_stride);
+      }
+    }
+  }
+}
+
+template <typename T>
+template <const int pad_top,
+          const int pad_left,
+          const int pad_bottom,
+          const int pad_right>
+void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::process_tile(
+    int n_channels,  // Number of channels in the tile
+    const T* const input_base,
+    const int input_row_stride,
+    const int input_col_stride,
+    T* const matrix_base,
+    const int matrix_stride
+) {
+  // Copy pointers
+  const T *inptr = input_base;
+  T *outptr = matrix_base;
+
+  // Process channels (modifies inptr, outptr and n_channels)
+  _process_tile<pad_top, pad_left, pad_bottom, pad_right, 4>(
+    n_channels, inptr, input_row_stride, input_col_stride,
+    outptr, matrix_stride
+  );
+  _process_tile<pad_top, pad_left, pad_bottom, pad_right, 2>(
+    n_channels, inptr, input_row_stride, input_col_stride,
+    outptr, matrix_stride
+  );
+  _process_tile<pad_top, pad_left, pad_bottom, pad_right, 1>(
+    n_channels, inptr, input_row_stride, input_col_stride,
+    outptr, matrix_stride
+  );
+}
+
+template <typename T>
+template <const int pad_top,
+          const int pad_left,
+          const int pad_bottom,
+          const int pad_right,
+          const int proc_channels>
+void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::_process_tile(
+    int &n_channels,
+    const T* &inptr, const int input_row_stride, const int input_col_stride,
+    T* &outptr, const int matrix_stride
+) {
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  T* outptrs[4] = {
+    outptr,
+    outptr + matrix_stride * 4,
+    outptr + matrix_stride * 8,
+    outptr + matrix_stride * 12
+  };
+
+  // The matrix X; zeroed to account for padding.
+  T x[4][4];
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 4; j++) {
+      x[i][j] = 0;
+    }
+  }
+
+  // The matrices X.T x and U
+  T XTx[4][4], U[4][4];
+
+  // Now progress through each channel
+  for (; n_channels >= proc_channels; n_channels -= proc_channels) {
+    for (int n = 0; n < proc_channels; n++) {
+      // Load the matrix X
+      for (int cell_i = pad_top, i = 0; cell_i < 4 - pad_bottom; cell_i++, i++) {
+        for (int cell_j = pad_left, j = 0; cell_j < 4 - pad_right; cell_j++, j++) {
+          x[cell_i][cell_j] = inptr[i*input_row_stride + j*input_col_stride];
+        }
+      }
+      inptr++;
+
+      // Compute the matrix X.T x
+      for (int j = 0; j < 4; j++) {
+        XTx[0][j] = x[0][j] - x[2][j];
+        XTx[1][j] = x[1][j] + x[2][j];
+        XTx[2][j] = x[2][j] - x[1][j];
+        XTx[3][j] = x[1][j] - x[3][j];
+      }
+
+      // Hence compute the matrix U
+      for (int i = 0; i < 4; i++) {
+        U[i][0] = XTx[i][0] - XTx[i][2];
+        U[i][1] = XTx[i][1] + XTx[i][2];
+        U[i][2] = XTx[i][2] - XTx[i][1];
+        U[i][3] = XTx[i][1] - XTx[i][3];
+      }
+
+      // Store the matrix U
+      for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+          outptrs[i][j * matrix_stride] = U[i][j];
+        }
+        outptrs[i]++;
+      }
+    }
+  }
+
+  // Update the output pointer for future calls
+  outptr = outptrs[0];
+}
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp
new file mode 100644
index 0000000..6c7f136
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp
@@ -0,0 +1,1491 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include "../input_2x2_3x3.hpp"
+
+#ifdef __aarch64__
+namespace winograd {
+
+// Pad left by one column, pad right by one column, no upper or lower padding, 4 channels
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<0, 1, 0, 1, 4>(
+    const int tile_N,            // Number of tiles in the row
+    const float* const input,    // Base input pointer (appropriate to batch, channel and row)
+    const int input_row_stride,  // Stride between rows of the input
+    const int input_col_stride,  // Stride between columns of the input
+    float* const matrix,         // 1st output matrix (appropriate to batch, channel and row)
+    const int matrix_stride,     // Stride between matrices
+    const int matrix_row_stride  // Stride between rows of the output matrix
+) {
+  /* SIMD register allocation
+   * ========================
+   *
+   * In the following code we read 4x4 tiles of a matrix `x`, with which we
+   * compute another matrix `X.T x` where:
+   *
+   *         /  1  0  0  0 \
+   *     X = |  0  1 -1  1 |
+   *         | -1  1  1  0 |
+   *         \  0  0  0 -1 /
+   *
+   * Hence, `X.T` is a program which operates upon rows of the matrix `x`.
+   * We subsequently compute and store the matrix `U = (X.T x) X`.
+   *
+   * Importantly, each iteration of the loop below loads a new matrix `x'`
+   * where the first two columns of `x'` are the final two columns of the
+   * previous `x`. That is:
+   *
+   *   x11  x12  x13  x14
+   *   x21  x22  x23  x24
+   *   x31  x32  x33  x34
+   *   x41  x42  x43  x44
+   *
+   *            x'11 x'12 x'13 x'14
+   *            x'21 x'22 x'23 x'24
+   *            x'31 x'32 x'33 x'34
+   *            x'41 x'42 x'43 x'44
+   *
+   * Consequently, while the first iteration of the loop below must load 16
+   * values for `x`, the second need only load 8. *Furthermore*, since we noted
+   * above that the operation `X.T x` is a program which operates upon *rows*
+   * of the matrix `x`, it follows that the relation `x'[i][1] = x[i][3]` and
+   * `x'[i][2] = x[i][4]` also applies to the matrices `X.T x'` and `X.T x`.
+   * That is:
+   *
+   *   (X.T x)11  (X.T x)12  (X.T x)13  (X.T x)14
+   *   (X.T x)21  (X.T x)22  (X.T x)23  (X.T x)24
+   *   (X.T x)31  (X.T x)32  (X.T x)33  (X.T x)34
+   *   (X.T x)41  (X.T x)42  (X.T x)43  (X.T x)44
+   *
+   *                        (X.T x')11 (X.T x')12 (X.T x')13 (X.T x')14
+   *                        (X.T x')21 (X.T x')22 (X.T x')23 (X.T x')24
+   *                        (X.T x')31 (X.T x')32 (X.T x')33 (X.T x')34
+   *                        (X.T x')41 (X.T x')42 (X.T x')43 (X.T x')44
+   *
+   * Hence, as well as not needing to load new values for x'[i][1..2] it is
+   * also unnecessary to recompute values for (X.T x')[i][1..2].
+   *
+   * Following this we break the registers into blocks `A` and `B` used by the
+   * two stages of the unrolled loop. These registers are named such that the
+   * latter columns of `A` become the earlier columns of `B` and vice-versa:
+   *
+   *  AXTx11 AXTx12 > AXTx13 AXTx14 |
+   *  AXTx21 AXTx22 > AXTx23 AXTx24 |
+   *  AXTx31 AXTx32 > AXTx33 AXTx34 |
+   *  AXTx41 AXTx42 > AXTx43 AXTx44 |
+   *
+   *  BXTx13 BXTx14 | BXTx11 BXTx12 >
+   *  BXTx23 BXTx24 | BXTx21 BXTx22 >
+   *  BXTx33 BXTx34 | BXTx31 BXTx32 >
+   *  BXTx43 BXTx44 | BXTx41 BXTx42 >
+   *
+   * These 32 named registers require only 16 architectural registers. 1
+   * additional architectural register is used as scratch space and 8
+   * architectural registers are used to load in the values x[1..4][3,4].
+   *
+   * Input and output addressing
+   * ===========================
+   */
+  const float *inptr0 = input;
+  const float *inptr1 = input + input_row_stride;
+  const float *inptr2 = input + input_row_stride * 2;
+  const float *inptr3 = input + input_row_stride * 3;
+
+  float *outptr0 = matrix;
+  float *outptr4 = matrix + matrix_stride * 4;
+  float *outptr8 = matrix + matrix_stride * 8;
+  float *outptr12 = matrix + matrix_stride * 12;
+
+  int tile_j = tile_N;  // Tiles to process
+
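+  // Editorial sketch (illustration only, not executed by this kernel): for a
+  // single scalar channel the transform performed by the assembly below is
+  //
+  //   float XTx[4][4], U[4][4];
+  //   for (int j = 0; j < 4; j++) {
+  //     XTx[0][j] = x[0][j] - x[2][j];
+  //     XTx[1][j] = x[1][j] + x[2][j];
+  //     XTx[2][j] = x[2][j] - x[1][j];
+  //     XTx[3][j] = x[1][j] - x[3][j];
+  //   }
+  //   for (int i = 0; i < 4; i++) {
+  //     U[i][0] = XTx[i][0] - XTx[i][2];
+  //     U[i][1] = XTx[i][1] + XTx[i][2];
+  //     U[i][2] = XTx[i][2] - XTx[i][1];
+  //     U[i][3] = XTx[i][1] - XTx[i][3];
+  //   }
+  //
+  // exactly as in Winograd2x2_3x3GemmInputChannelwise<T>::_process_tile. Each
+  // fsub/fadd below applies one of these operations to four channels at once;
+  // where a row or column of x is padding the corresponding term is zero,
+  // which is why the padded specialisations use fneg or mov in place of fsub.
+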
+  asm volatile (
+      // Named SIMD registers according to the policy given above
+      // Registers into which to load the latter two columns of `x`
+      "x_13 .req v0\n qx_13 .req q0\n" "x_14 .req v4\n qx_14 .req q4\n"
+      "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
+      "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
+      "x_43 .req v3\n qx_43 .req q3\n" "x_44 .req v7\n qx_44 .req q7\n"
+
+      // Registers for storing X.T x (both A and B halves)
+      "AXTx11 .req  v8\n" "BXTx13 .req  v8\n"
+      "AXTx12 .req  v9\n" "BXTx14 .req  v9\n" "qAXTx12 .req  q9\n"
+      "AXTx21 .req v10\n" "BXTx23 .req v10\n"
+      "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
+      "AXTx31 .req v12\n" "BXTx33 .req v12\n"
+      "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
+      "AXTx41 .req v14\n" "BXTx43 .req v14\n"
+      "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
+      "AXTx13 .req v16\n" "BXTx11 .req v16\n"
+      "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
+      "AXTx23 .req v18\n" "BXTx21 .req v18\n"
+      "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
+      "AXTx33 .req v20\n" "BXTx31 .req v20\n"
+      "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
+      "AXTx43 .req v22\n" "BXTx41 .req v22\n"
+      "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
+
+      "U .req v24\n qU .req q24\n"
+
+      // ----------------------------------------------------------------------
+      // Head of loop
+      //   Loads a complete 4x4 tile of x, computes X.T x, computes and stores
+      //   `U = X.T x X`. Prepares for the 'A' half of the loop.
+      //   NOTE: Since the first tile has the leftmost column padded we can
+      //   skip 4 loads and 4 calculations for the matrix X.T x X.
+
+      // Temporarily alias registers for computing the first (non-padded)
+      // column of x.
+      "x_12 .req v0\n qx_12 .req q0\n"
+      "x_22 .req v1\n qx_22 .req q1\n"
+      "x_32 .req v2\n qx_32 .req q2\n"
+      "x_42 .req v3\n qx_42 .req q3\n"
+
+      "ldr qx_12, [%x[inptr0]]\n"
+      "ldr qx_22, [%x[inptr1]]\n"
+      "ldr qx_32, [%x[inptr2]]\n"
+      "ldr qx_42, [%x[inptr3]]\n"
+
+      "fsub BXTx12.4s, x_12.4s, x_32.4s\n"
+      "fadd BXTx22.4s, x_22.4s, x_32.4s\n"
+      "fsub BXTx32.4s, x_32.4s, x_22.4s\n"
+      "fsub BXTx42.4s, x_22.4s, x_42.4s\n"
+
+      ".unreq x_12\n .unreq qx_12\n"
+      ".unreq x_22\n .unreq qx_22\n"
+      ".unreq x_32\n .unreq qx_32\n"
+      ".unreq x_42\n .unreq qx_42\n"
+
+      // Load and compute the latter two columns of the first tile. Progress
+      // the input pointers by three columns so that each points at the first
+      // column which must be read for the next tile.
+      "ldr qx_13, [%x[inptr0], %x[colstride1]]\n"
+      "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
+      "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
+      "ldr qx_43, [%x[inptr3], %x[colstride1]]\n"
+
+      "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+      "ldr qx_14, [%x[inptr0], %x[colstride2]]\n"
+
+      "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+      "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
+
+      "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+      "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
+
+      "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+      "ldr qx_44, [%x[inptr3], %x[colstride2]]\n"
+
+      "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
+      "add %x[inptr0],  %x[inptr0], %x[colstride3]\n"
+
+      "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+      "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
+
+      "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+      "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
+
+      "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
+      "add %x[inptr3], %x[inptr3], %x[colstride3]\n"
+
+      // Compute and store U for the first tile
+      // First row
+      "fneg U.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0]]\n"
+      "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0], %x[mstride1]]\n"
+      "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+      "str qU, [%x[outptr0], %x[mstride2]]\n"
+      "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+      "str qU, [%x[outptr0], %x[mstride3]]\n"
+      "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+      // Second row
+      "fneg U.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4]]\n"
+      "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4], %x[mstride1]]\n"
+      "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+      "str qU, [%x[outptr4], %x[mstride2]]\n"
+      "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+      "str qU, [%x[outptr4], %x[mstride3]]\n"
+      "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+      // Third row
+      "fneg U.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8]]\n"
+      "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8], %x[mstride1]]\n"
+      "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+      "str qU, [%x[outptr8], %x[mstride2]]\n"
+      "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+      "str qU, [%x[outptr8], %x[mstride3]]\n"
+      "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+      // Fourth row, simultaneously load the first column of inputs for the
+      // next tile.
+      "fneg U.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12]]\n"
+      "ldr qx_13, [%x[inptr0]]\n"
+
+      "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12], %x[mstride1]]\n"
+      "ldr qx_23, [%x[inptr1]]\n"
+
+      "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+      "str qU, [%x[outptr12], %x[mstride2]]\n"
+      "ldr qx_33, [%x[inptr2]]\n"
+
+      "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+      "str qU, [%x[outptr12], %x[mstride3]]\n"
+      "ldr qx_43, [%x[inptr3]]\n"
+
+      "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+      // Update the loop counter, subtract two to account for both the head and
+      // the tail.
+      "subs %x[tile_j], %x[tile_j], #2\n"
+      "beq 2f\n"  // Jump to "A" tail if out of tiles
+
+      // ----------------------------------------------------------------------
+      "1:"
+        // Start part A
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
+        "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
+        "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
+        "fsub AXTx14.4s, x_14.4s, x_34.4s\n"
+        "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
+        "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "fsub AXTx44.4s, x_24.4s, x_44.4s\n"
+        "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "ldr qx_13, [%x[inptr0]]\n"
+
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "ldr qx_43, [%x[inptr3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "beq 3f\n"  // Jump to 'B' tail
+
+        // Start part B
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+        "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+        "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
+        "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
+        "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
+        "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
+        "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "ldr qx_13, [%x[inptr0]]\n"
+
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "ldr qx_43, [%x[inptr3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "bne 1b\n"  // Continue loop, otherwise flow into 'A' tail
+
+      // ----------------------------------------------------------------------
+      "2:"
+        // 'A' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x`.
+        "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note, it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+        "b 4f\n"  // Jump to end of function
+
+      // ----------------------------------------------------------------------
+      "3:"
+        // 'B' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x`.
+        "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note, it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+      // ----------------------------------------------------------------------
+      "4:"
+        // End of function
+
+      // Clear names
+      ".unreq x_13\n" ".unreq qx_13\n" ".unreq x_14\n" ".unreq qx_14\n"
+      ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
+      ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
+      ".unreq x_43\n" ".unreq qx_43\n" ".unreq x_44\n" ".unreq qx_44\n"
+      ".unreq AXTx11\n" ".unreq BXTx13\n"
+      ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
+      ".unreq AXTx21\n" ".unreq BXTx23\n"
+      ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
+      ".unreq AXTx31\n" ".unreq BXTx33\n"
+      ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
+      ".unreq AXTx41\n" ".unreq BXTx43\n"
+      ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
+      ".unreq AXTx13\n" ".unreq BXTx11\n"
+      ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
+      ".unreq AXTx23\n" ".unreq BXTx21\n"
+      ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
+      ".unreq AXTx33\n" ".unreq BXTx31\n"
+      ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
+      ".unreq AXTx43\n" ".unreq BXTx41\n"
+      ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
+      ".unreq U\n" ".unreq qU\n"
+    : [inptr0] "+r" (inptr0),
+      [inptr1] "+r" (inptr1),
+      [inptr2] "+r" (inptr2),
+      [inptr3] "+r" (inptr3),
+      [outptr0] "+r" (outptr0),
+      [outptr4] "+r" (outptr4),
+      [outptr8] "+r" (outptr8),
+      [outptr12] "+r" (outptr12),
+      [tile_j] "+r" (tile_j)  // Tile counter
+    : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
+      [colstride2] "r" (2 * input_col_stride * sizeof(float)),
+      [colstride3] "r" (3 * input_col_stride * sizeof(float)),
+      [mstride1] "r" (1 * matrix_stride * sizeof(float)),
+      [mstride2] "r" (2 * matrix_stride * sizeof(float)),
+      [mstride3] "r" (3 * matrix_stride * sizeof(float)),
+      [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+      "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+      "v22", "v23", "v24"
+  );
+}
+
+// Pad top, left and right by 1.
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<1, 1, 0, 1, 4>(
+    const int tile_N,
+    const float* const input,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* const matrix,
+    const int matrix_stride,
+    const int matrix_row_stride
+) {
+  const float *inptr0 = input;
+  const float *inptr1 = input + input_row_stride;
+  const float *inptr2 = input + input_row_stride * 2;
+
+  float *outptr0 = matrix;
+  float *outptr4 = matrix + matrix_stride * 4;
+  float *outptr8 = matrix + matrix_stride * 8;
+  float *outptr12 = matrix + matrix_stride * 12;
+
+  int tile_j = tile_N;  // Tiles to process
+
+  asm volatile (
+      // Named SIMD registers according to the policy given above
+      // Registers into which to load the latter two columns of `x`
+      // NOTE: We need only load the latter three rows since we know that the
+      // first row is padded.
+      "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
+      "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
+      "x_43 .req v3\n qx_43 .req q3\n" "x_44 .req v7\n qx_44 .req q7\n"
+
+      // Registers for storing X.T x (both A and B halves)
+      "AXTx11 .req  v8\n" "BXTx13 .req  v8\n"
+      "AXTx12 .req  v9\n" "BXTx14 .req  v9\n" "qAXTx12 .req  q9\n"
+      "AXTx21 .req v10\n" "BXTx23 .req v10\n"
+      "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
+      "AXTx31 .req v12\n" "BXTx33 .req v12\n"
+      "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
+      "AXTx41 .req v14\n" "BXTx43 .req v14\n"
+      "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
+      "AXTx13 .req v16\n" "BXTx11 .req v16\n"
+      "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
+      "AXTx23 .req v18\n" "BXTx21 .req v18\n"
+      "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
+      "AXTx33 .req v20\n" "BXTx31 .req v20\n"
+      "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
+      "AXTx43 .req v22\n" "BXTx41 .req v22\n"
+      "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
+
+      "U .req v24\n qU .req q24\n"
+
+      // ----------------------------------------------------------------------
+      // Head of loop
+      //   Loads a complete 4x4 tile of x, computes X.T x, computes and stores
+      //   `U = X.T x X`. Prepares for the 'A' half of the loop.
+      //   NOTE: Since the first tile has the leftmost column padded we can
+      //   skip 4 loads and 4 calculations for the matrix X.T x X.
+
+      // Temporarily alias registers for computing the first (non-padded)
+      // column of x.
+      "x_22 .req v1\n qx_22 .req q1\n"
+      "x_32 .req v2\n qx_32 .req q2\n"
+      "x_42 .req v3\n qx_42 .req q3\n"
+
+      "ldr qx_22, [%x[inptr1]]\n"
+      "ldr qx_32, [%x[inptr2]]\n"
+      "ldr qx_42, [%x[inptr3]]\n"
+
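+      // With the top row padded, x_1j is zero, so the first row of X.T x
+      // reduces to -x_3j; hence the fneg below in place of the usual fsub.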
+      "fneg BXTx12.4s,          x_32.4s\n"
+      "fadd BXTx22.4s, x_22.4s, x_32.4s\n"
+      "fsub BXTx32.4s, x_32.4s, x_22.4s\n"
+      "fsub BXTx42.4s, x_22.4s, x_42.4s\n"
+
+      ".unreq x_22\n .unreq qx_22\n"
+      ".unreq x_32\n .unreq qx_32\n"
+      ".unreq x_42\n .unreq qx_42\n"
+
+      // Load and compute the latter two columns of the first tile. Progress
+      // the input pointers by three columns so that each points at the first
+      // column which must be read for the next tile.
+      "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
+      "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
+      "ldr qx_43, [%x[inptr3], %x[colstride1]]\n"
+
+      "fneg BXTx13.4s,          x_33.4s\n"
+
+      "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+      "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
+
+      "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+      "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
+
+      "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+      "ldr qx_44, [%x[inptr3], %x[colstride2]]\n"
+
+      "fneg BXTx14.4s,          x_34.4s\n"
+
+      "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+      "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
+
+      "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+      "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
+
+      "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
+      "add %x[inptr3], %x[inptr3], %x[colstride3]\n"
+
+      // Compute and store U for the first tile
+      // First row
+      "fneg U.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0]]\n"
+      "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0], %x[mstride1]]\n"
+      "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+      "str qU, [%x[outptr0], %x[mstride2]]\n"
+      "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+      "str qU, [%x[outptr0], %x[mstride3]]\n"
+      "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+      // Second row
+      "fneg U.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4]]\n"
+      "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4], %x[mstride1]]\n"
+      "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+      "str qU, [%x[outptr4], %x[mstride2]]\n"
+      "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+      "str qU, [%x[outptr4], %x[mstride3]]\n"
+      "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+      // Third row
+      "fneg U.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8]]\n"
+      "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8], %x[mstride1]]\n"
+      "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+      "str qU, [%x[outptr8], %x[mstride2]]\n"
+      "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+      "str qU, [%x[outptr8], %x[mstride3]]\n"
+      "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+      // Fourth row, simultaneously load the first column of inputs for the
+      // next tile.
+      "fneg U.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12]]\n"
+
+      "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12], %x[mstride1]]\n"
+      "ldr qx_23, [%x[inptr1]]\n"
+
+      "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+      "str qU, [%x[outptr12], %x[mstride2]]\n"
+      "ldr qx_33, [%x[inptr2]]\n"
+
+      "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+      "str qU, [%x[outptr12], %x[mstride3]]\n"
+      "ldr qx_43, [%x[inptr3]]\n"
+
+      "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+      // Update the loop counter, subtract two to account for both the head and
+      // the tail.
+      "subs %x[tile_j], %x[tile_j], #2\n"
+      "beq 2f\n"  // Jump to "A" tail if out of tiles
+
+      // ----------------------------------------------------------------------
+      "1:"
+        // Start part A
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fneg AXTx13.4s,          x_33.4s\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
+        "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
+        "fneg AXTx14.4s,          x_34.4s\n"
+        "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "fsub AXTx44.4s, x_24.4s, x_44.4s\n"
+        "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "ldr qx_43, [%x[inptr3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "beq 3f\n"  // Jump to 'B' tail
+
+        // Start part B
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fneg BXTx13.4s,          x_33.4s\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+        "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
+        "fneg BXTx14.4s,          x_34.4s\n"
+        "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
+        "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "ldr qx_43, [%x[inptr3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "bne 1b\n"  // Continue loop, otherwise flow into 'A' tail
+
+      // ----------------------------------------------------------------------
+      "2:"
+        // 'A' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x`.
+        "fneg AXTx13.4s,          x_33.4s\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note, it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+        "b 4f\n"  // Jump to end of function
+
+      // ----------------------------------------------------------------------
+      "3:"
+        // 'B' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x`.
+        "fneg BXTx13.4s,          x_33.4s\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note, it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+      // ----------------------------------------------------------------------
+      "4:"
+        // End of function
+
+      // Clear names
+      ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
+      ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
+      ".unreq x_43\n" ".unreq qx_43\n" ".unreq x_44\n" ".unreq qx_44\n"
+      ".unreq AXTx11\n" ".unreq BXTx13\n"
+      ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
+      ".unreq AXTx21\n" ".unreq BXTx23\n"
+      ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
+      ".unreq AXTx31\n" ".unreq BXTx33\n"
+      ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
+      ".unreq AXTx41\n" ".unreq BXTx43\n"
+      ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
+      ".unreq AXTx13\n" ".unreq BXTx11\n"
+      ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
+      ".unreq AXTx23\n" ".unreq BXTx21\n"
+      ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
+      ".unreq AXTx33\n" ".unreq BXTx31\n"
+      ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
+      ".unreq AXTx43\n" ".unreq BXTx41\n"
+      ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
+      ".unreq U\n" ".unreq qU\n"
+    : [inptr1] "+r" (inptr0),  // Offset to account for padded row
+      [inptr2] "+r" (inptr1),  // Offset to account for padded row
+      [inptr3] "+r" (inptr2),  // Offset to account for padded row
+      [outptr0] "+r" (outptr0),
+      [outptr4] "+r" (outptr4),
+      [outptr8] "+r" (outptr8),
+      [outptr12] "+r" (outptr12),
+      [tile_j] "+r" (tile_j)  // Tile counter
+    : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
+      [colstride2] "r" (2 * input_col_stride * sizeof(float)),
+      [colstride3] "r" (3 * input_col_stride * sizeof(float)),
+      [mstride1] "r" (1 * matrix_stride * sizeof(float)),
+      [mstride2] "r" (2 * matrix_stride * sizeof(float)),
+      [mstride3] "r" (3 * matrix_stride * sizeof(float)),
+      [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+      "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+      "v22", "v23", "v24"
+  );
+}
+
+// Pad left, right and bottom by 1.
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<0, 1, 1, 1, 4>(
+    const int tile_N,
+    const float* const input,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* const matrix,
+    const int matrix_stride,
+    const int matrix_row_stride
+) {
+  const float *inptr0 = input;
+  const float *inptr1 = input + input_row_stride;
+  const float *inptr2 = input + input_row_stride * 2;
+
+  float *outptr0 = matrix;
+  float *outptr4 = matrix + matrix_stride * 4;
+  float *outptr8 = matrix + matrix_stride * 8;
+  float *outptr12 = matrix + matrix_stride * 12;
+
+  int tile_j = tile_N;  // Tiles to process
+
+  asm volatile (
+      // Named SIMD registers according to the policy given above
+      // Registers into which to load the latter two columns of `x`
+      // NOTE: The bottom row is not required since it is padded.
+      "x_13 .req v0\n qx_13 .req q0\n" "x_14 .req v4\n qx_14 .req q4\n"
+      "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
+      "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
+
+      // Registers for storing X.T x (both A and B halves)
+      "AXTx11 .req  v8\n" "BXTx13 .req  v8\n"
+      "AXTx12 .req  v9\n" "BXTx14 .req  v9\n" "qAXTx12 .req  q9\n"
+      "AXTx21 .req v10\n" "BXTx23 .req v10\n"
+      "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
+      "AXTx31 .req v12\n" "BXTx33 .req v12\n"
+      "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
+      "AXTx41 .req v14\n" "BXTx43 .req v14\n"
+      "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
+      "AXTx13 .req v16\n" "BXTx11 .req v16\n"
+      "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
+      "AXTx23 .req v18\n" "BXTx21 .req v18\n"
+      "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
+      "AXTx33 .req v20\n" "BXTx31 .req v20\n"
+      "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
+      "AXTx43 .req v22\n" "BXTx41 .req v22\n"
+      "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
+
+      "U .req v24\n qU .req q24\n"
+
+      // ----------------------------------------------------------------------
+      // Head of loop
+      //   Loads a complete 4x4 tile of x, computes X.T x, computes and stores
+      //   `U = X.T x X`. Prepares for the 'A' half of the loop.
+      //   NOTE: Since the first tile has the leftmost column padded we can
+      //   skip 4 loads and 4 calculations for the matrix X.T x X.
+
+      // Temporarily alias registers for computing the first (non-padded)
+      // column of x.
+      "x_12 .req v0\n qx_12 .req q0\n"
+      "x_22 .req v1\n qx_22 .req q1\n"
+      "x_32 .req v2\n qx_32 .req q2\n"
+
+      "ldr qx_12, [%x[inptr0]]\n"
+      "ldr qx_22, [%x[inptr1]]\n"
+      "ldr qx_32, [%x[inptr2]]\n"
+
+      "fsub BXTx12.4s,  x_12.4s, x_32.4s\n"
+      "fadd BXTx22.4s,  x_22.4s, x_32.4s\n"
+      "fsub BXTx32.4s,  x_32.4s, x_22.4s\n"
+      "mov  BXTx42.16b, x_22.16b\n"  // Probably should do better
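+      // (With the bottom row padded, x_4j is zero, so the fourth row of X.T x
+      // reduces to x_2j and a plain mov suffices.)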
+
+      ".unreq x_12\n .unreq qx_12\n"
+      ".unreq x_22\n .unreq qx_22\n"
+      ".unreq x_32\n .unreq qx_32\n"
+
+      // Load and compute the latter two columns of the first tile. Progress
+      // the input pointers by three columns so that each points at the first
+      // column which must be read for the next tile.
+      "ldr qx_13, [%x[inptr0], %x[colstride1]]\n"
+      "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
+      "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
+
+      "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+      "ldr qx_14, [%x[inptr0], %x[colstride2]]\n"
+
+      "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+      "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
+
+      "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+      "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
+
+      "mov  BXTx43.16b, x_23.16b\n"
+      "fsub BXTx14.4s,  x_14.4s, x_34.4s\n"
+      "add %x[inptr0],  %x[inptr0], %x[colstride3]\n"
+
+      "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+      "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
+
+      "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+      "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
+
+      "mov BXTx44.16b, x_24.16b\n"
+
+      // Compute and store U for the first tile
+      // First row
+      "fneg U.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0]]\n"
+      "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+      "str qU, [%x[outptr0], %x[mstride1]]\n"
+      "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+      "str qU, [%x[outptr0], %x[mstride2]]\n"
+      "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+      "str qU, [%x[outptr0], %x[mstride3]]\n"
+      "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+      // Second row
+      "fneg U.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4]]\n"
+      "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+      "str qU, [%x[outptr4], %x[mstride1]]\n"
+      "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+      "str qU, [%x[outptr4], %x[mstride2]]\n"
+      "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+      "str qU, [%x[outptr4], %x[mstride3]]\n"
+      "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+      // Third row
+      "fneg U.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8]]\n"
+      "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+      "str qU, [%x[outptr8], %x[mstride1]]\n"
+      "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+      "str qU, [%x[outptr8], %x[mstride2]]\n"
+      "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+      "str qU, [%x[outptr8], %x[mstride3]]\n"
+      "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+      // Fourth row, simultaneously load the first column of inputs for the
+      // next tile.
+      "fneg U.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12]]\n"
+      "ldr qx_13, [%x[inptr0]]\n"
+
+      "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+      "str qU, [%x[outptr12], %x[mstride1]]\n"
+      "ldr qx_23, [%x[inptr1]]\n"
+
+      "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+      "str qU, [%x[outptr12], %x[mstride2]]\n"
+      "ldr qx_33, [%x[inptr2]]\n"
+
+      "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+      "str qU, [%x[outptr12], %x[mstride3]]\n"
+
+      "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+      // Update the loop counter, subtract two to account for both the head and
+      // the tail.
+      "subs %x[tile_j], %x[tile_j], #2\n"
+      "beq 2f\n"  // Jump to "A" tail if out of tiles
+
+      // ----------------------------------------------------------------------
+      "1:"
+        // Start part A
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
+        "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "mov  AXTx43.16b, x_23.16b\n"
+
+        "fsub AXTx14.4s, x_14.4s, x_34.4s\n"
+        "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
+        "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "mov  AXTx44.16b, x_24.16b\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "ldr qx_13, [%x[inptr0]]\n"
+
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "beq 3f\n"  // Jump to 'B' tail
+
+        // Start part B
+        // Load last column of this tile (the first column has already been
+        // loaded) and compute latter two columns of X.T x.
+        "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+        "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
+        "mov BXTx43.16b, x_23.16b\n"
+
+        "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
+        "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
+        "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
+        "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
+        "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
+        "mov BXTx44.16b, x_24.16b\n"
+
+        // Compute and store U.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "ldr qx_13, [%x[inptr0]]\n"
+
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "ldr qx_23, [%x[inptr1]]\n"
+
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "ldr qx_33, [%x[inptr2]]\n"
+
+        "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+
+        "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
+        "subs %x[tile_j], %x[tile_j], #1\n"
+        "bne 1b\n"  // Continue loop, otherwise flow into 'A' tail
+
+      // ----------------------------------------------------------------------
+      "2:"
+        // 'A' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x`.
+        "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
+        "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
+        "mov  AXTx43.16b, x_23.16b\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note, it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+        "b 4f\n"  // Jump to end of function
+
+      // ----------------------------------------------------------------------
+      "3:"
+        // 'B' tail
+        // Since the final column is padding and the last-but-one column has
+        // already been loaded, just compute the 3rd column of `X.T x'.
+        "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
+        "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
+        "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
+        "mov  BXTx43.16b, x_23.16b\n"
+
+        // Compute and store U. Modified to account for the final column of X.T
+        // x containing padding. Note that it is also unnecessary to update the
+        // output pointers.
+        // First row
+        "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
+
+        // Second row
+        "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
+
+        // Third row
+        "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
+
+        // Fourth row
+        "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
+
+      // ----------------------------------------------------------------------
+      "4:"
+        // End of function
+
+      // Clear names
+      ".unreq x_13\n" ".unreq qx_13\n" ".unreq x_14\n" ".unreq qx_14\n"
+      ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
+      ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
+      ".unreq AXTx11\n" ".unreq BXTx13\n"
+      ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
+      ".unreq AXTx21\n" ".unreq BXTx23\n"
+      ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
+      ".unreq AXTx31\n" ".unreq BXTx33\n"
+      ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
+      ".unreq AXTx41\n" ".unreq BXTx43\n"
+      ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
+      ".unreq AXTx13\n" ".unreq BXTx11\n"
+      ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
+      ".unreq AXTx23\n" ".unreq BXTx21\n"
+      ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
+      ".unreq AXTx33\n" ".unreq BXTx31\n"
+      ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
+      ".unreq AXTx43\n" ".unreq BXTx41\n"
+      ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
+      ".unreq U\n" ".unreq qU\n"
+    : [inptr0] "+r" (inptr0),
+      [inptr1] "+r" (inptr1),
+      [inptr2] "+r" (inptr2),
+      [outptr0] "+r" (outptr0),
+      [outptr4] "+r" (outptr4),
+      [outptr8] "+r" (outptr8),
+      [outptr12] "+r" (outptr12),
+      [tile_j] "+r" (tile_j)  // Tile counter
+    : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
+      [colstride2] "r" (2 * input_col_stride * sizeof(float)),
+      [colstride3] "r" (3 * input_col_stride * sizeof(float)),
+      [mstride1] "r" (1 * matrix_stride * sizeof(float)),
+      [mstride2] "r" (2 * matrix_stride * sizeof(float)),
+      [mstride3] "r" (3 * matrix_stride * sizeof(float)),
+      [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
+    : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+      "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+      "v22", "v23", "v24"
+  );
+}
+}
+#endif  // __aarch64__
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp
new file mode 100644
index 0000000..ad1ad55
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include "../input_2x2_3x3.hpp"
+
+#ifdef __aarch64__
+
+namespace winograd {
+
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 0, 0, 4>(
+    int &n_channels,  // Number of channels in the tile
+    const float* &inptr0,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* &outptr0,
+    const int matrix_stride
+) {
+  // We use 4 pointers to point to the starting position on each row and use
+  // three offsets to extract elements from each of the other 3 columns.
+  auto inptr1 = inptr0 + 1*input_row_stride;
+  auto inptr2 = inptr0 + 2*input_row_stride;
+  auto inptr3 = inptr0 + 3*input_row_stride;
+
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  auto outptr1 = outptr0 + matrix_stride * 4;
+  auto outptr2 = outptr0 + matrix_stride * 8;
+  auto outptr3 = outptr0 + matrix_stride * 12;
+
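+  // For reference (assuming the standard Winograd F(2x2, 3x3) derivation), the
+  // input transform computed below is U = B.T X B with
+  //       [1  0 -1  0]
+  // B.T = [0  1  1  0]
+  //       [0 -1  1  0]
+  //       [0  1  0 -1]
+  // The assembly first forms X B in the xX registers (one fadd/fsub per
+  // element) and then U = B.T (X B), again using only additions/subtractions.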
+  for (; n_channels > 3; n_channels -= 4) {
+    asm volatile (
+        "X_11 .req  v0\n"  "qX_11 .req  q0\n"
+        "X_12 .req  v1\n"  "qX_12 .req  q1\n"
+        "X_13 .req  v2\n"  "qX_13 .req  q2\n"
+        "X_14 .req  v3\n"  "qX_14 .req  q3\n"
+        "X_21 .req  v4\n"  "qX_21 .req  q4\n"
+        "X_22 .req  v5\n"  "qX_22 .req  q5\n"
+        "X_23 .req  v6\n"  "qX_23 .req  q6\n"
+        "X_24 .req  v7\n"  "qX_24 .req  q7\n"
+        "X_31 .req  v8\n"  "qX_31 .req  q8\n"
+        "X_32 .req  v9\n"  "qX_32 .req  q9\n"
+        "X_33 .req v10\n"  "qX_33 .req q10\n"
+        "X_34 .req v11\n"  "qX_34 .req q11\n"
+        "X_41 .req v12\n"  "qX_41 .req q12\n"
+        "X_42 .req v13\n"  "qX_42 .req q13\n"
+        "X_43 .req v14\n"  "qX_43 .req q14\n"
+        "X_44 .req v15\n"  "qX_44 .req q15\n"
+        "xX_11 .req v16\n"
+        "xX_12 .req v17\n"
+        "xX_13 .req v18\n"
+        "xX_14 .req v19\n"
+        "xX_21 .req v20\n"
+        "xX_22 .req v21\n"
+        "xX_23 .req v22\n"
+        "xX_24 .req v23\n"
+        "xX_31 .req v24\n"
+        "xX_32 .req v25\n"
+        "xX_33 .req v26\n"
+        "xX_34 .req v27\n"
+        "xX_41 .req v28\n"
+        "xX_42 .req v29\n"
+        "xX_43 .req v30\n"
+        "xX_44 .req v31\n"
+        " U .req v0\n"
+        "qU .req q0\n"
+
+        // Load the tile and compute the matrix xX
+        "ldr qX_11, [%x[inptr0]]\n"
+        "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
+        "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
+        "ldr qX_14, [%x[inptr0], %x[colstride3]]\n"
+        "add %x[inptr0], %x[inptr0], #0x10\n"
+
+        "ldr qX_21, [%x[inptr1]]\n"
+        "fsub xX_11.4s, x_11.4s, x_13.4s\n"
+        "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
+        "fadd xX_12.4s, x_12.4s, x_13.4s\n"
+        "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
+        "fsub xX_13.4s, x_13.4s, x_12.4s\n"
+        "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
+        "fsub xX_14.4s, x_12.4s, x_14.4s\n"
+        "add %x[inptr1], %x[inptr1], #0x10\n"
+
+        "ldr qX_31, [%x[inptr2]]\n"
+        "fsub xX_21.4s, x_21.4s, x_23.4s\n"
+        "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
+        "fadd xX_22.4s, x_22.4s, x_23.4s\n"
+        "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
+        "fsub xX_23.4s, x_23.4s, x_22.4s\n"
+        "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
+        "fsub xX_24.4s, x_22.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], #0x10\n"
+
+        "ldr qX_41, [%x[inptr3]]\n"
+        "fsub xX_31.4s, x_31.4s, x_33.4s\n"
+        "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
+        "fadd xX_32.4s, x_32.4s, x_33.4s\n"
+        "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
+        "fsub xX_33.4s, x_33.4s, x_32.4s\n"
+        "ldr qX_44, [%x[inptr3], %x[colstride3]]\n"
+        "fsub xX_34.4s, x_32.4s, x_34.4s\n"
+        "add %x[inptr3], %x[inptr3], #0x10\n"
+
+        // Complete computing xX while beginning to compute and store
+        // $U = X.T x X$
+
+        "fsub xX_41.4s, x_41.4s, x_43.4s\n"
+
+        "fsub U.4s, xX_11.4s, xX_31.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fsub U.4s, xX_12.4s, xX_32.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, xX_13.4s, xX_33.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, xX_14.4s, xX_34.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], #0x10\n"
+
+        "fadd xX_42.4s, x_42.4s, x_43.4s\n"
+
+        "fadd U.4s, xX_21.4s, xX_31.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, xX_22.4s, xX_32.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fadd U.4s, xX_23.4s, xX_33.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fadd U.4s, xX_24.4s, xX_34.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], #0x10\n"
+
+        "fsub xX_43.4s, x_43.4s, x_42.4s\n"
+
+        "fsub U.4s, xX_31.4s, xX_21.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fsub U.4s, xX_32.4s, xX_22.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, xX_33.4s, xX_23.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, xX_34.4s, xX_24.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], #0x10\n"
+
+        "fsub xX_44.4s, x_42.4s, x_44.4s\n"
+
+        "fsub U.4s, xX_21.4s, xX_41.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fsub U.4s, xX_22.4s, xX_42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, xX_23.4s, xX_43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "fsub U.4s, xX_24.4s, xX_44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "add %x[outptr12], %x[outptr12], #0x10\n"
+
+        ".unreq qU\n"
+        ".unreq U\n"
+        ".unreq X_11\n"  ".unreq qX_11\n"
+        ".unreq X_12\n"  ".unreq qX_12\n"
+        ".unreq X_13\n"  ".unreq qX_13\n"
+        ".unreq X_14\n"  ".unreq qX_14\n"
+        ".unreq X_21\n"  ".unreq qX_21\n"
+        ".unreq X_22\n"  ".unreq qX_22\n"
+        ".unreq X_23\n"  ".unreq qX_23\n"
+        ".unreq X_24\n"  ".unreq qX_24\n"
+        ".unreq X_31\n"  ".unreq qX_31\n"
+        ".unreq X_32\n"  ".unreq qX_32\n"
+        ".unreq X_33\n"  ".unreq qX_33\n"
+        ".unreq X_34\n"  ".unreq qX_34\n"
+        ".unreq X_41\n"  ".unreq qX_41\n"
+        ".unreq X_42\n"  ".unreq qX_42\n"
+        ".unreq X_43\n"  ".unreq qX_43\n"
+        ".unreq X_44\n"  ".unreq qX_44\n"
+        ".unreq xX_11\n"
+        ".unreq xX_12\n"
+        ".unreq xX_13\n"
+        ".unreq xX_14\n"
+        ".unreq xX_21\n"
+        ".unreq xX_22\n"
+        ".unreq xX_23\n"
+        ".unreq xX_24\n"
+        ".unreq xX_31\n"
+        ".unreq xX_32\n"
+        ".unreq xX_33\n"
+        ".unreq xX_34\n"
+        ".unreq xX_41\n"
+        ".unreq xX_42\n"
+        ".unreq xX_43\n"
+        ".unreq xX_44\n"
+
+        : [inptr0] "+r" (inptr0),
+          [inptr1] "+r" (inptr1),
+          [inptr2] "+r" (inptr2),
+          [inptr3] "+r" (inptr3),
+          [outptr0] "+r" (outptr0),
+          [outptr4] "+r" (outptr1),
+          [outptr8] "+r" (outptr2),
+          [outptr12] "+r" (outptr3)
+        : [colstride1] "r" (input_col_stride * sizeof(float)),
+          [colstride2] "r" (input_col_stride * sizeof(float) * 2),
+          [colstride3] "r" (input_col_stride * sizeof(float) * 3),
+          [mstride1] "r" (matrix_stride * sizeof(float)),
+          [mstride2] "r" (matrix_stride * sizeof(float) * 2),
+          [mstride3] "r" (matrix_stride * sizeof(float) * 3)
+        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31"
+    );
+  }
+}
+
+// Pad top by 1
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<1, 0, 0, 0, 4>(
+    int &n_channels,  // Number of channels in the tile
+    const float* &inptr0,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* &outptr0,
+    const int matrix_stride
+) {
+  // The top row of the tile is padding, so we use 3 pointers to point to the
+  // starting position of each valid row and use three offsets to extract
+  // elements from each of the other 3 columns.
+  auto inptr1 = inptr0 + 1*input_row_stride;
+  auto inptr2 = inptr0 + 2*input_row_stride;
+
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  auto outptr1 = outptr0 + matrix_stride * 4;
+  auto outptr2 = outptr0 + matrix_stride * 8;
+  auto outptr3 = outptr0 + matrix_stride * 12;
+
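+  // Because the top row of the tile lies in the (zero) padding, the X_1j terms
+  // vanish and the first row of U = B.T (X B) reduces to -(X B)_3j; this is
+  // why the first block of stores below uses fneg rather than fsub.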
+  for (; n_channels > 3; n_channels -= 4) {
+    asm volatile (
+        "X_21 .req  v4\n"  "qX_21 .req  q4\n"
+        "X_22 .req  v5\n"  "qX_22 .req  q5\n"
+        "X_23 .req  v6\n"  "qX_23 .req  q6\n"
+        "X_24 .req  v7\n"  "qX_24 .req  q7\n"
+        "X_31 .req  v8\n"  "qX_31 .req  q8\n"
+        "X_32 .req  v9\n"  "qX_32 .req  q9\n"
+        "X_33 .req v10\n"  "qX_33 .req q10\n"
+        "X_34 .req v11\n"  "qX_34 .req q11\n"
+        "X_41 .req v12\n"  "qX_41 .req q12\n"
+        "X_42 .req v13\n"  "qX_42 .req q13\n"
+        "X_43 .req v14\n"  "qX_43 .req q14\n"
+        "X_44 .req v15\n"  "qX_44 .req q15\n"
+        "xX_21 .req v20\n"
+        "xX_22 .req v21\n"
+        "xX_23 .req v22\n"
+        "xX_24 .req v23\n"
+        "xX_31 .req v24\n"
+        "xX_32 .req v25\n"
+        "xX_33 .req v26\n"
+        "xX_34 .req v27\n"
+        "xX_41 .req v28\n"
+        "xX_42 .req v29\n"
+        "xX_43 .req v30\n"
+        "xX_44 .req v31\n"
+        " U .req v0\n"
+        "qU .req q0\n"
+
+        // Load the tile and compute the matrix xX
+        "ldr qX_21, [%x[inptr1]]\n"
+        "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
+        "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
+        "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
+        "add %x[inptr1], %x[inptr1], #0x10\n"
+
+        "ldr qX_31, [%x[inptr2]]\n"
+        "fsub xX_21.4s, x_21.4s, x_23.4s\n"
+        "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
+        "fadd xX_22.4s, x_22.4s, x_23.4s\n"
+        "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
+        "fsub xX_23.4s, x_23.4s, x_22.4s\n"
+        "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
+        "fsub xX_24.4s, x_22.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], #0x10\n"
+
+        "ldr qX_41, [%x[inptr3]]\n"
+        "fsub xX_31.4s, x_31.4s, x_33.4s\n"
+        "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
+        "fadd xX_32.4s, x_32.4s, x_33.4s\n"
+        "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
+        "fsub xX_33.4s, x_33.4s, x_32.4s\n"
+        "ldr qX_44, [%x[inptr3], %x[colstride3]]\n"
+        "fsub xX_34.4s, x_32.4s, x_34.4s\n"
+        "add %x[inptr3], %x[inptr3], #0x10\n"
+
+        // Complete computing xX while beginning to compute and store
+        // $U = X.T x X$
+
+        "fsub xX_41.4s, x_41.4s, x_43.4s\n"
+
+        "fneg U.4s, xX_31.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fneg U.4s, xX_32.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fneg U.4s, xX_33.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fneg U.4s, xX_34.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], #0x10\n"
+
+        "fadd xX_42.4s, x_42.4s, x_43.4s\n"
+
+        "fadd U.4s, xX_21.4s, xX_31.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, xX_22.4s, xX_32.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fadd U.4s, xX_23.4s, xX_33.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fadd U.4s, xX_24.4s, xX_34.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], #0x10\n"
+
+        "fsub xX_43.4s, x_43.4s, x_42.4s\n"
+
+        "fsub U.4s, xX_31.4s, xX_21.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fsub U.4s, xX_32.4s, xX_22.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, xX_33.4s, xX_23.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, xX_34.4s, xX_24.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], #0x10\n"
+
+        "fsub xX_44.4s, x_42.4s, x_44.4s\n"
+
+        "fsub U.4s, xX_21.4s, xX_41.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fsub U.4s, xX_22.4s, xX_42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, xX_23.4s, xX_43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "fsub U.4s, xX_24.4s, xX_44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "add %x[outptr12], %x[outptr12], #0x10\n"
+
+        ".unreq qU\n"
+        ".unreq U\n"
+        ".unreq X_21\n"  ".unreq qX_21\n"
+        ".unreq X_22\n"  ".unreq qX_22\n"
+        ".unreq X_23\n"  ".unreq qX_23\n"
+        ".unreq X_24\n"  ".unreq qX_24\n"
+        ".unreq X_31\n"  ".unreq qX_31\n"
+        ".unreq X_32\n"  ".unreq qX_32\n"
+        ".unreq X_33\n"  ".unreq qX_33\n"
+        ".unreq X_34\n"  ".unreq qX_34\n"
+        ".unreq X_41\n"  ".unreq qX_41\n"
+        ".unreq X_42\n"  ".unreq qX_42\n"
+        ".unreq X_43\n"  ".unreq qX_43\n"
+        ".unreq X_44\n"  ".unreq qX_44\n"
+        ".unreq xX_21\n"
+        ".unreq xX_22\n"
+        ".unreq xX_23\n"
+        ".unreq xX_24\n"
+        ".unreq xX_31\n"
+        ".unreq xX_32\n"
+        ".unreq xX_33\n"
+        ".unreq xX_34\n"
+        ".unreq xX_41\n"
+        ".unreq xX_42\n"
+        ".unreq xX_43\n"
+        ".unreq xX_44\n"
+
+        : [inptr1] "+r" (inptr0),  // Offset for missing row
+          [inptr2] "+r" (inptr1),  // Offset for missing row
+          [inptr3] "+r" (inptr2),  // Offset for missing row
+          [outptr0] "+r" (outptr0),
+          [outptr4] "+r" (outptr1),
+          [outptr8] "+r" (outptr2),
+          [outptr12] "+r" (outptr3)
+        : [colstride1] "r" (input_col_stride * sizeof(float)),
+          [colstride2] "r" (input_col_stride * sizeof(float) * 2),
+          [colstride3] "r" (input_col_stride * sizeof(float) * 3),
+          [mstride1] "r" (matrix_stride * sizeof(float)),
+          [mstride2] "r" (matrix_stride * sizeof(float) * 2),
+          [mstride3] "r" (matrix_stride * sizeof(float) * 3)
+        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31"
+    );
+  }
+}
+
+// Pad left by 1
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 1, 0, 0, 4>(
+    int &n_channels,  // Number of channels in the tile
+    const float* &inptr0,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* &outptr0,
+    const int matrix_stride
+) {
+  // We use 4 pointers to point to the starting position on each row and, since
+  // the leftmost column of the tile is padding, two offsets to extract
+  // elements from the remaining columns.
+  auto inptr1 = inptr0 + 1*input_row_stride;
+  auto inptr2 = inptr0 + 2*input_row_stride;
+  auto inptr3 = inptr0 + 3*input_row_stride;
+
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  auto outptr1 = outptr0 + matrix_stride * 4;
+  auto outptr2 = outptr0 + matrix_stride * 8;
+  auto outptr3 = outptr0 + matrix_stride * 12;
+
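+  // Because the leftmost column of the tile lies in the (zero) padding, the
+  // X_i1 terms vanish and the first column of X B reduces to -X_i3; hence the
+  // fneg instructions used to form xX_11, xX_21, xX_31 and xX_41 below.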
+  for (; n_channels > 3; n_channels -= 4) {
+    asm volatile (
+        "X_12 .req  v1\n"  "qX_12 .req  q1\n"
+        "X_13 .req  v2\n"  "qX_13 .req  q2\n"
+        "X_14 .req  v3\n"  "qX_14 .req  q3\n"
+        "X_22 .req  v5\n"  "qX_22 .req  q5\n"
+        "X_23 .req  v6\n"  "qX_23 .req  q6\n"
+        "X_24 .req  v7\n"  "qX_24 .req  q7\n"
+        "X_32 .req  v9\n"  "qX_32 .req  q9\n"
+        "X_33 .req v10\n"  "qX_33 .req q10\n"
+        "X_34 .req v11\n"  "qX_34 .req q11\n"
+        "X_42 .req v13\n"  "qX_42 .req q13\n"
+        "X_43 .req v14\n"  "qX_43 .req q14\n"
+        "X_44 .req v15\n"  "qX_44 .req q15\n"
+        "xX_11 .req v16\n"
+        "xX_12 .req v17\n"
+        "xX_13 .req v18\n"
+        "xX_14 .req v19\n"
+        "xX_21 .req v20\n"
+        "xX_22 .req v21\n"
+        "xX_23 .req v22\n"
+        "xX_24 .req v23\n"
+        "xX_31 .req v24\n"
+        "xX_32 .req v25\n"
+        "xX_33 .req v26\n"
+        "xX_34 .req v27\n"
+        "xX_41 .req v28\n"
+        "xX_42 .req v29\n"
+        "xX_43 .req v30\n"
+        "xX_44 .req v31\n"
+        " U .req v0\n"
+        "qU .req q0\n"
+
+        // Load the tile and compute the matrix xX
+        "ldr qX_12, [%x[inptr0]]\n"
+        "ldr qX_13, [%x[inptr0], %x[colstride1]]\n"
+        "ldr qX_14, [%x[inptr0], %x[colstride2]]\n"
+        "add %x[inptr0], %x[inptr0], #0x10\n"
+
+        "fneg xX_11.4s, x_13.4s\n"
+        "ldr qX_22, [%x[inptr1]]\n"
+        "fadd xX_12.4s, x_12.4s, x_13.4s\n"
+        "ldr qX_23, [%x[inptr1], %x[colstride1]]\n"
+        "fsub xX_13.4s, x_13.4s, x_12.4s\n"
+        "ldr qX_24, [%x[inptr1], %x[colstride2]]\n"
+        "fsub xX_14.4s, x_12.4s, x_14.4s\n"
+        "add %x[inptr1], %x[inptr1], #0x10\n"
+
+        "fneg xX_21.4s, x_23.4s\n"
+        "ldr qX_32, [%x[inptr2]]\n"
+        "fadd xX_22.4s, x_22.4s, x_23.4s\n"
+        "ldr qX_33, [%x[inptr2], %x[colstride1]]\n"
+        "fsub xX_23.4s, x_23.4s, x_22.4s\n"
+        "ldr qX_34, [%x[inptr2], %x[colstride2]]\n"
+        "fsub xX_24.4s, x_22.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], #0x10\n"
+
+        "fneg xX_31.4s, x_33.4s\n"
+        "ldr qX_42, [%x[inptr3]]\n"
+        "fadd xX_32.4s, x_32.4s, x_33.4s\n"
+        "ldr qX_43, [%x[inptr3], %x[colstride1]]\n"
+        "fsub xX_33.4s, x_33.4s, x_32.4s\n"
+        "ldr qX_44, [%x[inptr3], %x[colstride2]]\n"
+        "fsub xX_34.4s, x_32.4s, x_34.4s\n"
+        "add %x[inptr3], %x[inptr3], #0x10\n"
+
+        // Complete computing xX while beginning to compute and store
+        // $U = X.T x X$
+
+        "fneg xX_41.4s, x_43.4s\n"
+
+        "fsub U.4s, xX_11.4s, xX_31.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fsub U.4s, xX_12.4s, xX_32.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, xX_13.4s, xX_33.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, xX_14.4s, xX_34.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], #0x10\n"
+
+        "fadd xX_42.4s, x_42.4s, x_43.4s\n"
+
+        "fadd U.4s, xX_21.4s, xX_31.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, xX_22.4s, xX_32.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fadd U.4s, xX_23.4s, xX_33.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fadd U.4s, xX_24.4s, xX_34.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], #0x10\n"
+
+        "fsub xX_43.4s, x_43.4s, x_42.4s\n"
+
+        "fsub U.4s, xX_31.4s, xX_21.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fsub U.4s, xX_32.4s, xX_22.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, xX_33.4s, xX_23.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, xX_34.4s, xX_24.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], #0x10\n"
+
+        "fsub xX_44.4s, x_42.4s, x_44.4s\n"
+
+        "fsub U.4s, xX_21.4s, xX_41.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fsub U.4s, xX_22.4s, xX_42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, xX_23.4s, xX_43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "fsub U.4s, xX_24.4s, xX_44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "add %x[outptr12], %x[outptr12], #0x10\n"
+
+        ".unreq X_12\n"  ".unreq qX_12\n"
+        ".unreq X_13\n"  ".unreq qX_13\n"
+        ".unreq X_14\n"  ".unreq qX_14\n"
+        ".unreq X_22\n"  ".unreq qX_22\n"
+        ".unreq X_23\n"  ".unreq qX_23\n"
+        ".unreq X_24\n"  ".unreq qX_24\n"
+        ".unreq X_32\n"  ".unreq qX_32\n"
+        ".unreq X_33\n"  ".unreq qX_33\n"
+        ".unreq X_34\n"  ".unreq qX_34\n"
+        ".unreq X_42\n"  ".unreq qX_42\n"
+        ".unreq X_43\n"  ".unreq qX_43\n"
+        ".unreq X_44\n"  ".unreq qX_44\n"
+        ".unreq xX_11\n"
+        ".unreq xX_12\n"
+        ".unreq xX_13\n"
+        ".unreq xX_14\n"
+        ".unreq xX_21\n"
+        ".unreq xX_22\n"
+        ".unreq xX_23\n"
+        ".unreq xX_24\n"
+        ".unreq xX_31\n"
+        ".unreq xX_32\n"
+        ".unreq xX_33\n"
+        ".unreq xX_34\n"
+        ".unreq xX_41\n"
+        ".unreq xX_42\n"
+        ".unreq xX_43\n"
+        ".unreq xX_44\n"
+        ".unreq U\n"
+        ".unreq qU\n"
+
+        : [inptr0] "+r" (inptr0),
+          [inptr1] "+r" (inptr1),
+          [inptr2] "+r" (inptr2),
+          [inptr3] "+r" (inptr3),
+          [outptr0] "+r" (outptr0),
+          [outptr4] "+r" (outptr1),
+          [outptr8] "+r" (outptr2),
+          [outptr12] "+r" (outptr3)
+        : [colstride1] "r" (input_col_stride * sizeof(float)),
+          [colstride2] "r" (input_col_stride * sizeof(float) * 2),
+          [mstride1] "r" (matrix_stride * sizeof(float)),
+          [mstride2] "r" (matrix_stride * sizeof(float) * 2),
+          [mstride3] "r" (matrix_stride * sizeof(float) * 3)
+        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31"
+    );
+  }
+}
+
+// Pad bottom by 1
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 1, 0, 4>(
+    int &n_channels,  // Number of channels in the tile
+    const float* &inptr0,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* &outptr0,
+    const int matrix_stride
+) {
+  // The bottom row of the tile is padding, so we use 3 pointers to point to
+  // the starting position of each valid row and use three offsets to extract
+  // elements from each of the other 3 columns.
+  auto inptr1 = inptr0 + 1*input_row_stride;
+  auto inptr2 = inptr0 + 2*input_row_stride;
+
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  auto outptr1 = outptr0 + matrix_stride * 4;
+  auto outptr2 = outptr0 + matrix_stride * 8;
+  auto outptr3 = outptr0 + matrix_stride * 12;
+
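+  // Because the bottom row of the tile lies in the (zero) padding, the X_4j
+  // terms vanish and the last row of U = B.T (X B) reduces to (X B)_2j; the
+  // final block of stores below therefore writes xX_21..xX_24 out directly.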
+  for (; n_channels > 3; n_channels -= 4) {
+    asm volatile (
+        "X_11 .req  v0\n"  "qX_11 .req  q0\n"
+        "X_12 .req  v1\n"  "qX_12 .req  q1\n"
+        "X_13 .req  v2\n"  "qX_13 .req  q2\n"
+        "X_14 .req  v3\n"  "qX_14 .req  q3\n"
+        "X_21 .req  v4\n"  "qX_21 .req  q4\n"
+        "X_22 .req  v5\n"  "qX_22 .req  q5\n"
+        "X_23 .req  v6\n"  "qX_23 .req  q6\n"
+        "X_24 .req  v7\n"  "qX_24 .req  q7\n"
+        "X_31 .req  v8\n"  "qX_31 .req  q8\n"
+        "X_32 .req  v9\n"  "qX_32 .req  q9\n"
+        "X_33 .req v10\n"  "qX_33 .req q10\n"
+        "X_34 .req v11\n"  "qX_34 .req q11\n"
+        "xX_11 .req v16\n"
+        "xX_12 .req v17\n"
+        "xX_13 .req v18\n"
+        "xX_14 .req v19\n"
+        "xX_21 .req v20\n" "qxX_21 .req q20\n"
+        "xX_22 .req v21\n" "qxX_22 .req q21\n"
+        "xX_23 .req v22\n" "qxX_23 .req q22\n"
+        "xX_24 .req v23\n" "qxX_24 .req q23\n"
+        "xX_31 .req v24\n"
+        "xX_32 .req v25\n"
+        "xX_33 .req v26\n"
+        "xX_34 .req v27\n"
+        " U .req v0\n"
+        "qU .req q0\n"
+
+        // Load the tile and compute the matrix xX
+        "ldr qX_11, [%x[inptr0]]\n"
+        "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
+        "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
+        "ldr qX_14, [%x[inptr0], %x[colstride3]]\n"
+        "add %x[inptr0], %x[inptr0], #0x10\n"
+
+        "ldr qX_21, [%x[inptr1]]\n"
+        "fsub xX_11.4s, x_11.4s, x_13.4s\n"
+        "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
+        "fadd xX_12.4s, x_12.4s, x_13.4s\n"
+        "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
+        "fsub xX_13.4s, x_13.4s, x_12.4s\n"
+        "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
+        "fsub xX_14.4s, x_12.4s, x_14.4s\n"
+        "add %x[inptr1], %x[inptr1], #0x10\n"
+
+        "ldr qX_31, [%x[inptr2]]\n"
+        "fsub xX_21.4s, x_21.4s, x_23.4s\n"
+        "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
+        "fadd xX_22.4s, x_22.4s, x_23.4s\n"
+        "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
+        "fsub xX_23.4s, x_23.4s, x_22.4s\n"
+        "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
+        "fsub xX_24.4s, x_22.4s, x_24.4s\n"
+        "add %x[inptr2], %x[inptr2], #0x10\n"
+
+        "fsub xX_31.4s, x_31.4s, x_33.4s\n"
+        "fadd xX_32.4s, x_32.4s, x_33.4s\n"
+        "fsub xX_33.4s, x_33.4s, x_32.4s\n"
+        "fsub xX_34.4s, x_32.4s, x_34.4s\n"
+
+        // Complete computing xX while beginning to compute and store
+        // $U = X.T x X$
+
+        "fsub U.4s, xX_11.4s, xX_31.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fsub U.4s, xX_12.4s, xX_32.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, xX_13.4s, xX_33.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, xX_14.4s, xX_34.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], #0x10\n"
+
+        "fadd U.4s, xX_21.4s, xX_31.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, xX_22.4s, xX_32.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fadd U.4s, xX_23.4s, xX_33.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fadd U.4s, xX_24.4s, xX_34.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], #0x10\n"
+
+        "fsub U.4s, xX_31.4s, xX_21.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fsub U.4s, xX_32.4s, xX_22.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, xX_33.4s, xX_23.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, xX_34.4s, xX_24.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], #0x10\n"
+
+        "str qxX_21, [%x[outptr12]]\n"
+        "str qxX_22, [%x[outptr12], %x[mstride1]]\n"
+        "str qxX_23, [%x[outptr12], %x[mstride2]]\n"
+        "str qxX_24, [%x[outptr12], %x[mstride3]]\n"
+        "add %x[outptr12], %x[outptr12], #0x10\n"
+
+        ".unreq qU\n"
+        ".unreq U\n"
+        ".unreq X_11\n"  ".unreq qX_11\n"
+        ".unreq X_12\n"  ".unreq qX_12\n"
+        ".unreq X_13\n"  ".unreq qX_13\n"
+        ".unreq X_14\n"  ".unreq qX_14\n"
+        ".unreq X_21\n"  ".unreq qX_21\n"
+        ".unreq X_22\n"  ".unreq qX_22\n"
+        ".unreq X_23\n"  ".unreq qX_23\n"
+        ".unreq X_24\n"  ".unreq qX_24\n"
+        ".unreq X_31\n"  ".unreq qX_31\n"
+        ".unreq X_32\n"  ".unreq qX_32\n"
+        ".unreq X_33\n"  ".unreq qX_33\n"
+        ".unreq X_34\n"  ".unreq qX_34\n"
+        ".unreq xX_11\n"
+        ".unreq xX_12\n"
+        ".unreq xX_13\n"
+        ".unreq xX_14\n"
+        ".unreq xX_21\n" ".unreq qxX_21\n"
+        ".unreq xX_22\n" ".unreq qxX_22\n"
+        ".unreq xX_23\n" ".unreq qxX_23\n"
+        ".unreq xX_24\n" ".unreq qxX_24\n"
+        ".unreq xX_31\n"
+        ".unreq xX_32\n"
+        ".unreq xX_33\n"
+        ".unreq xX_34\n"
+
+        : [inptr0] "+r" (inptr0),
+          [inptr1] "+r" (inptr1),
+          [inptr2] "+r" (inptr2),
+          [outptr0] "+r" (outptr0),
+          [outptr4] "+r" (outptr1),
+          [outptr8] "+r" (outptr2),
+          [outptr12] "+r" (outptr3)
+        : [colstride1] "r" (input_col_stride * sizeof(float)),
+          [colstride2] "r" (input_col_stride * sizeof(float) * 2),
+          [colstride3] "r" (input_col_stride * sizeof(float) * 3),
+          [mstride1] "r" (matrix_stride * sizeof(float)),
+          [mstride2] "r" (matrix_stride * sizeof(float) * 2),
+          [mstride3] "r" (matrix_stride * sizeof(float) * 3)
+        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31"
+    );
+  }
+}
+
+// Pad right by 1
+template <>
+template <>
+inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 0, 1, 4>(
+    int &n_channels,  // Number of channels in the tile
+    const float* &inptr0,
+    const int input_row_stride,
+    const int input_col_stride,
+    float* &outptr0,
+    const int matrix_stride
+) {
+  // We use 4 pointers to point to the starting position on each row and, since
+  // the rightmost column of the tile is padding, two offsets to extract
+  // elements from the remaining columns.
+  auto inptr1 = inptr0 + 1*input_row_stride;
+  auto inptr2 = inptr0 + 2*input_row_stride;
+  auto inptr3 = inptr0 + 3*input_row_stride;
+
+  // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
+  // offsets to access the intermediate matrices.
+  auto outptr1 = outptr0 + matrix_stride * 4;
+  auto outptr2 = outptr0 + matrix_stride * 8;
+  auto outptr3 = outptr0 + matrix_stride * 12;
+
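+  // Because the rightmost column of the tile lies in the (zero) padding, the
+  // X_i4 terms vanish and the last column of X B reduces to X_i2; the register
+  // aliases below (e.g. "xX_14 .req x_12") simply reuse the loaded column.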
+  for (; n_channels > 3; n_channels -= 4) {
+    asm volatile (
+        "X_11 .req  v0\n"  "qX_11 .req  q0\n"
+        "X_12 .req  v1\n"  "qX_12 .req  q1\n"
+        "X_13 .req  v2\n"  "qX_13 .req  q2\n"
+        "X_21 .req  v4\n"  "qX_21 .req  q4\n"
+        "X_22 .req  v5\n"  "qX_22 .req  q5\n"
+        "X_23 .req  v6\n"  "qX_23 .req  q6\n"
+        "X_31 .req  v8\n"  "qX_31 .req  q8\n"
+        "X_32 .req  v9\n"  "qX_32 .req  q9\n"
+        "X_33 .req v10\n"  "qX_33 .req q10\n"
+        "X_41 .req v12\n"  "qX_41 .req q12\n"
+        "X_42 .req v13\n"  "qX_42 .req q13\n"
+        "X_43 .req v14\n"  "qX_43 .req q14\n"
+        "xX_11 .req v16\n"
+        "xX_12 .req v17\n"
+        "xX_13 .req v18\n"
+        "xX_14 .req x_12\n"
+        "xX_21 .req v20\n"
+        "xX_22 .req v21\n"
+        "xX_23 .req v22\n"
+        "xX_24 .req x_22\n"
+        "xX_31 .req v24\n"
+        "xX_32 .req v25\n"
+        "xX_33 .req v26\n"
+        "xX_34 .req x_32\n"
+        "xX_41 .req v28\n"
+        "xX_42 .req v29\n"
+        "xX_43 .req v30\n"
+        "xX_44 .req x_42\n"
+        " U .req v0\n"
+        "qU .req q0\n"
+
+        // Load the tile and compute the matrix xX
+        "ldr qX_11, [%x[inptr0]]\n"
+        "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
+        "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
+        "add %x[inptr0], %x[inptr0], #0x10\n"
+
+        "ldr qX_21, [%x[inptr1]]\n"
+        "fsub xX_11.4s, x_11.4s, x_13.4s\n"
+        "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
+        "fadd xX_12.4s, x_12.4s, x_13.4s\n"
+        "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
+        "fsub xX_13.4s, x_13.4s, x_12.4s\n"
+        "add %x[inptr1], %x[inptr1], #0x10\n"
+
+        "ldr qX_31, [%x[inptr2]]\n"
+        "fsub xX_21.4s, x_21.4s, x_23.4s\n"
+        "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
+        "fadd xX_22.4s, x_22.4s, x_23.4s\n"
+        "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
+        "fsub xX_23.4s, x_23.4s, x_22.4s\n"
+        "add %x[inptr2], %x[inptr2], #0x10\n"
+
+        "ldr qX_41, [%x[inptr3]]\n"
+        "fsub xX_31.4s, x_31.4s, x_33.4s\n"
+        "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
+        "fadd xX_32.4s, x_32.4s, x_33.4s\n"
+        "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
+        "fsub xX_33.4s, x_33.4s, x_32.4s\n"
+        "add %x[inptr3], %x[inptr3], #0x10\n"
+
+        // Complete computing xX while beginning to compute and store
+        // $U = X.T x X$
+
+        "fsub xX_41.4s, x_41.4s, x_43.4s\n"
+
+        "fsub U.4s, xX_11.4s, xX_31.4s\n"
+        "str qU, [%x[outptr0]]\n"
+        "fsub U.4s, xX_12.4s, xX_32.4s\n"
+        "str qU, [%x[outptr0], %x[mstride1]]\n"
+        "fsub U.4s, xX_13.4s, xX_33.4s\n"
+        "str qU, [%x[outptr0], %x[mstride2]]\n"
+        "fsub U.4s, xX_14.4s, xX_34.4s\n"
+        "str qU, [%x[outptr0], %x[mstride3]]\n"
+        "add %x[outptr0], %x[outptr0], #0x10\n"
+
+        "fadd xX_42.4s, x_42.4s, x_43.4s\n"
+
+        "fadd U.4s, xX_21.4s, xX_31.4s\n"
+        "str qU, [%x[outptr4]]\n"
+        "fadd U.4s, xX_22.4s, xX_32.4s\n"
+        "str qU, [%x[outptr4], %x[mstride1]]\n"
+        "fadd U.4s, xX_23.4s, xX_33.4s\n"
+        "str qU, [%x[outptr4], %x[mstride2]]\n"
+        "fadd U.4s, xX_24.4s, xX_34.4s\n"
+        "str qU, [%x[outptr4], %x[mstride3]]\n"
+        "add %x[outptr4], %x[outptr4], #0x10\n"
+
+        "fsub xX_43.4s, x_43.4s, x_42.4s\n"
+
+        "fsub U.4s, xX_31.4s, xX_21.4s\n"
+        "str qU, [%x[outptr8]]\n"
+        "fsub U.4s, xX_32.4s, xX_22.4s\n"
+        "str qU, [%x[outptr8], %x[mstride1]]\n"
+        "fsub U.4s, xX_33.4s, xX_23.4s\n"
+        "str qU, [%x[outptr8], %x[mstride2]]\n"
+        "fsub U.4s, xX_34.4s, xX_24.4s\n"
+        "str qU, [%x[outptr8], %x[mstride3]]\n"
+        "add %x[outptr8], %x[outptr8], #0x10\n"
+
+        "fsub U.4s, xX_21.4s, xX_41.4s\n"
+        "str qU, [%x[outptr12]]\n"
+        "fsub U.4s, xX_22.4s, xX_42.4s\n"
+        "str qU, [%x[outptr12], %x[mstride1]]\n"
+        "fsub U.4s, xX_23.4s, xX_43.4s\n"
+        "str qU, [%x[outptr12], %x[mstride2]]\n"
+        "fsub U.4s, xX_24.4s, xX_44.4s\n"
+        "str qU, [%x[outptr12], %x[mstride3]]\n"
+        "add %x[outptr12], %x[outptr12], #0x10\n"
+
+        ".unreq qU\n"
+        ".unreq U\n"
+        ".unreq X_11\n"  ".unreq qX_11\n"
+        ".unreq X_12\n"  ".unreq qX_12\n"
+        ".unreq X_13\n"  ".unreq qX_13\n"
+        ".unreq X_21\n"  ".unreq qX_21\n"
+        ".unreq X_22\n"  ".unreq qX_22\n"
+        ".unreq X_23\n"  ".unreq qX_23\n"
+        ".unreq X_31\n"  ".unreq qX_31\n"
+        ".unreq X_32\n"  ".unreq qX_32\n"
+        ".unreq X_33\n"  ".unreq qX_33\n"
+        ".unreq X_41\n"  ".unreq qX_41\n"
+        ".unreq X_42\n"  ".unreq qX_42\n"
+        ".unreq X_43\n"  ".unreq qX_43\n"
+        ".unreq xX_11\n"
+        ".unreq xX_12\n"
+        ".unreq xX_13\n"
+        ".unreq xX_14\n"
+        ".unreq xX_21\n"
+        ".unreq xX_22\n"
+        ".unreq xX_23\n"
+        ".unreq xX_24\n"
+        ".unreq xX_31\n"
+        ".unreq xX_32\n"
+        ".unreq xX_33\n"
+        ".unreq xX_34\n"
+        ".unreq xX_41\n"
+        ".unreq xX_42\n"
+        ".unreq xX_43\n"
+        ".unreq xX_44\n"
+
+        : [inptr0] "+r" (inptr0),
+          [inptr1] "+r" (inptr1),
+          [inptr2] "+r" (inptr2),
+          [inptr3] "+r" (inptr3),
+          [outptr0] "+r" (outptr0),
+          [outptr4] "+r" (outptr1),
+          [outptr8] "+r" (outptr2),
+          [outptr12] "+r" (outptr3)
+        : [colstride1] "r" (input_col_stride * sizeof(float)),
+          [colstride2] "r" (input_col_stride * sizeof(float) * 2),
+          [mstride1] "r" (matrix_stride * sizeof(float)),
+          [mstride2] "r" (matrix_stride * sizeof(float) * 2),
+          [mstride3] "r" (matrix_stride * sizeof(float) * 3)
+        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+          "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+          "v30", "v31"
+    );
+  }
+}
+}
+#endif
diff --git a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp
new file mode 100644
index 0000000..033442a
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+namespace winograd {
+  /* Transform a kernel into the Winograd domain.
+   *
+   * NOTE: It is assumed that the kernel is in the form [height x width x
+   * input_channels x output_channels].
+   */
+  template <typename T>
+  struct winograd2x2_3x3_gemm_kernel_transform_impl {
+    static void execute(
+      const KernelShape &shape,
+      const T* const kernel,
+      T* const matrix_base,
+      const int matrix_stride,
+      const int matrix_row_stride
+    );
+
+    protected:
+    template <const int output_channel_tail>
+    static void transform_kernel(
+      const T* const kernel,
+      const int n_input_channels,
+      const int n_output_channels,
+      T* const matrix_base,
+      const int matrix_stride,
+      const int matrix_row_stride
+    );
+  };
+}
+
+/*****************************************************************************/
+/* Transform an fp32 kernel into the Winograd domain.
+ */
+#include "kernel_2x2_3x3/a64_float.hpp"  // AArch64 specialisations
+
+namespace winograd
+{
+template <>
+inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::execute(
+  const KernelShape &shape,
+  const float* const kernel,
+  float* const matrix_base,
+  const int matrix_stride,
+  const int matrix_row_stride
+) {
+  // Delegate based on tail size
+  const int n_input_channels = shape.n_input_channels;
+  const int n_output_channels = shape.n_output_channels;
+
+  switch (n_output_channels % 4) {
+    case 0:
+      transform_kernel<0>(
+        kernel, n_input_channels, n_output_channels,
+        matrix_base, matrix_stride, matrix_row_stride
+      );
+      break;
+    case 1:
+      transform_kernel<1>(
+        kernel, n_input_channels, n_output_channels,
+        matrix_base, matrix_stride, matrix_row_stride
+      );
+      break;
+    case 2:
+      transform_kernel<2>(
+        kernel, n_input_channels, n_output_channels,
+        matrix_base, matrix_stride, matrix_row_stride
+      );
+      break;
+    case 3:
+      transform_kernel<3>(
+        kernel, n_input_channels, n_output_channels,
+        matrix_base, matrix_stride, matrix_row_stride
+      );
+      break;
+    default:
+      ARM_COMPUTE_ERROR("Cannot happen");
+      break;
+  }
+}
+
+template <>
+template<const int output_channel_tail>
+inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel(
+    const float* const kernel,
+    const int n_input_channels,
+    const int n_output_channels,
+    float* const matrix_base,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  // Use one input pointer for each row of the kernel and two additional
+  // offsets to extract columns.
+  const int kernel_col_stride = n_input_channels * n_output_channels;
+  const int kernel_row_stride = 3 * kernel_col_stride;
+  const float *inptr0 = kernel;
+  const float *inptr1 = kernel + kernel_row_stride;
+  const float *inptr2 = kernel + kernel_row_stride*2;
+
+  // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
+  // offsets to extract further matrices.
+  float  *outptr0 = matrix_base;
+  float  *outptr4 = matrix_base + mstride * 4;
+  float  *outptr8 = matrix_base + mstride * 8;
+  float *outptr12 = matrix_base + mstride * 12;
+
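+  // The loops below evaluate the Winograd F(2x2, 3x3) kernel transform
+  // U = W w W.T one output channel at a time where, in the standard
+  // derivation,
+  //     [  1     0    0 ]
+  // W = [ 1/2   1/2  1/2]
+  //     [ 1/2  -1/2  1/2]
+  //     [  0     0    1 ]
+  // (this matrix is usually written G in the Winograd convolution literature).
+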
+  // For every input channel
+  for (int in_c = 0; in_c < n_input_channels; in_c++) {
+    // For every output channel
+    for (int c = 0; c < n_output_channels; c++) {
+      // Read in the kernel
+      float w11 = inptr0[0], w12 = inptr0[kernel_col_stride], w13 = inptr0[kernel_col_stride*2];
+      float w21 = inptr1[0], w22 = inptr1[kernel_col_stride], w23 = inptr1[kernel_col_stride*2];
+      float w31 = inptr2[0], w32 = inptr2[kernel_col_stride], w33 = inptr2[kernel_col_stride*2];
+
+      // Progress input pointers
+      inptr0++;
+      inptr1++;
+      inptr2++;
+
+      // Compute the matrix W w; note that we need only compute the middle two
+      // rows (2 and 3) because the first and last rows are merely copies of
+      // rows of w.
+      float Ww11 = w11, Ww12 = w12, Ww13 = w13;
+      float Ww21 = 0.5*(w11 + w21 + w31), Ww22 = 0.5*(w12 + w22 + w32), Ww23 = 0.5*(w13 + w23 + w33);
+      float Ww31 = 0.5*(w11 - w21 + w31), Ww32 = 0.5*(w12 - w22 + w32), Ww33 = 0.5*(w13 - w23 + w33);
+      float Ww41 = w31, Ww42 = w32, Ww43 = w33;
+
+      // Hence compute W w W.T; again note that we need only compute the middle
+      // two columns since the first and last columns are copies of the first
+      // and last columns of the previous matrix.
+      float WwWT11 = Ww11, WwWT12 = 0.5*(Ww11 + Ww12 + Ww13), WwWT13 = 0.5*(Ww11 - Ww12 + Ww13), WwWT14 = Ww13;
+      float WwWT21 = Ww21, WwWT22 = 0.5*(Ww21 + Ww22 + Ww23), WwWT23 = 0.5*(Ww21 - Ww22 + Ww23), WwWT24 = Ww23;
+      float WwWT31 = Ww31, WwWT32 = 0.5*(Ww31 + Ww32 + Ww33), WwWT33 = 0.5*(Ww31 - Ww32 + Ww33), WwWT34 = Ww33;
+      float WwWT41 = Ww41, WwWT42 = 0.5*(Ww41 + Ww42 + Ww43), WwWT43 = 0.5*(Ww41 - Ww42 + Ww43), WwWT44 = Ww43;
+
+      // Store the computed weights
+      outptr0[0 * mstride] = WwWT11;
+      outptr0[1 * mstride] = WwWT12;
+      outptr0[2 * mstride] = WwWT13;
+      outptr0[3 * mstride] = WwWT14;
+
+      outptr4[0 * mstride] = WwWT21;
+      outptr4[1 * mstride] = WwWT22;
+      outptr4[2 * mstride] = WwWT23;
+      outptr4[3 * mstride] = WwWT24;
+
+      outptr8[0 * mstride] = WwWT31;
+      outptr8[1 * mstride] = WwWT32;
+      outptr8[2 * mstride] = WwWT33;
+      outptr8[3 * mstride] = WwWT34;
+
+      outptr12[0 * mstride] = WwWT41;
+      outptr12[1 * mstride] = WwWT42;
+      outptr12[2 * mstride] = WwWT43;
+      outptr12[3 * mstride] = WwWT44;
+
+      // Progress output pointers
+      outptr0++;
+      outptr4++;
+      outptr8++;
+      outptr12++;
+    }
+
+    // Progression to complete stride
+    outptr0 += matrix_row_stride - n_output_channels;
+    outptr4 += matrix_row_stride - n_output_channels;
+    outptr8 += matrix_row_stride - n_output_channels;
+    outptr12 += matrix_row_stride - n_output_channels;
+  }
+}
+}
diff --git a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp
new file mode 100644
index 0000000..3dd62d1
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp
@@ -0,0 +1,822 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+namespace winograd {
+template <>
+template <>
+inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<0>(
+    const float* const kernel,
+    const int n_input_channels,
+    const int n_output_channels,
+    float* const matrix_base,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  // Use one input pointer for each row of the kernel and two additional
+  // offsets to extract columns.
+  const int kernel_col_stride = n_input_channels * n_output_channels;
+  const int kernel_row_stride = 3 * kernel_col_stride;
+  const float *inptr0 = kernel;
+  const float *inptr1 = kernel + kernel_row_stride;
+  const float *inptr2 = kernel + kernel_row_stride*2;
+
+  // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
+  // offsets to extract further matrices.
+  float  *outptr0 = matrix_base;
+  float  *outptr4 = matrix_base + mstride * 4;
+  float  *outptr8 = matrix_base + mstride * 8;
+  float *outptr12 = matrix_base + mstride * 12;
+
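+  // The assembly below computes the same U = W w W.T as the reference
+  // implementation in kernel_2x2_3x3.hpp, but for four output channels per
+  // iteration: 0.5 is broadcast into the "half" vector register and the two
+  // non-trivial rows and columns of U are formed with fmul/fmla/fmls.
+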
+  // For every input channel
+  for (int in_c = 0; in_c < n_input_channels; in_c++) {
+    int n_remaining_channels = n_output_channels;
+
+    asm volatile (
+        // Registers into which to read the kernel
+        "w_11 .req v0\n"  "qw_11 .req q0\n"
+        "w_12 .req v1\n"  "qw_12 .req q1\n"
+        "w_13 .req v2\n"  "qw_13 .req q2\n"
+        "w_21 .req v3\n"  "qw_21 .req q3\n"
+        "w_22 .req v4\n"  "qw_22 .req q4\n"
+        "w_23 .req v5\n"  "qw_23 .req q5\n"
+        "w_31 .req v6\n"  "qw_31 .req q6\n"
+        "w_32 .req v7\n"  "qw_32 .req q7\n"
+        "w_33 .req v8\n"  "qw_33 .req q8\n"
+
+        // Transformed matrix Ww
+        "Ww11 .req w_11\n"  "Ww12 .req w_12\n"  "Ww13 .req w_13\n"
+        "Ww21 .req  v9\n"   "Ww22 .req v10\n"   "Ww23 .req v11\n"
+        "Ww31 .req v12\n"   "Ww32 .req v13\n"   "Ww33 .req v14\n"
+        "Ww41 .req w_31\n"  "Ww42 .req w_32\n"  "Ww43 .req w_33\n"
+
+        // Output matrix U = WwWT
+        "U11 .req Ww11\n"   "U12 .req v15\n"  "U13 .req v16\n"  "U14 .req Ww13\n"
+        "U21 .req Ww21\n"   "U22 .req v17\n"  "U23 .req v18\n"  "U24 .req Ww23\n"
+        "U31 .req Ww31\n"   "U32 .req v19\n"  "U33 .req v20\n"  "U34 .req Ww33\n"
+        "U41 .req Ww41\n"   "U42 .req v21\n"  "U43 .req v22\n"  "U44 .req Ww43\n"
+
+        // Storage view of output matrices
+        "qU11 .req   q0\n"   "qU12 .req q15\n"  "qU13 .req q16\n"  "qU14 .req   q2\n"
+        "qU21 .req   q9\n"   "qU22 .req q17\n"  "qU23 .req q18\n"  "qU24 .req  q11\n"
+        "qU31 .req  q12\n"   "qU32 .req q19\n"  "qU33 .req q20\n"  "qU34 .req  q14\n"
+        "qU41 .req   q6\n"   "qU42 .req q21\n"  "qU43 .req q22\n"  "qU44 .req   q8\n"
+
+        "half .req v23\n"  // {0.5, ..., 0.5}
+        "dup half.4s, %w[one_half]\n"
+        "scratch .req v24\n"
+
+        "1:"
+          // Load tile of the kernel
+          "ldr qw_11, [%x[inptr0]]\n"
+          "str qU11, [%x[outptr0]]\n"
+          "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
+          "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
+          "str qU14, [%x[outptr0], %x[mstride3]]\n"
+          "add %x[inptr0], %x[inptr0], #0x10\n"
+
+          "ldr qw_21, [%x[inptr1]]\n"
+          "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
+          "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
+          "add %x[inptr1], %x[inptr1], #0x10\n"
+
+          "ldr qw_31, [%x[inptr2]]\n"
+          "str qU41, [%x[outptr12]]\n"
+          "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
+          "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
+          "str qU44, [%x[outptr12], %x[mstride3]]\n"
+          "add %x[inptr2], %x[inptr2], #0x10\n"
+
+          // Compute 2nd and 3rd rows of Ww
+          "fadd scratch.4s, w_11.4s, w_31.4s\n"
+          "fmul Ww21.4s, scratch.4s, half.4s\n"
+          "fmla Ww21.4s, w_21.4s, half.4s\n"
+          "str qU21, [%x[outptr4]]\n"
+          "fmul Ww31.4s, scratch.4s, half.4s\n"
+          "fmls Ww31.4s, w_21.4s, half.4s\n"
+          "str qU31, [%x[outptr8]]\n"
+
+          "fadd scratch.4s, w_12.4s, w_32.4s\n"
+          "fmul Ww22.4s, scratch.4s, half.4s\n"
+          "fmla Ww22.4s, w_22.4s, half.4s\n"
+          "fmul Ww32.4s, scratch.4s, half.4s\n"
+          "fmls Ww32.4s, w_22.4s, half.4s\n"
+
+          "fadd scratch.4s, w_13.4s, w_33.4s\n"
+          "fmul Ww23.4s, scratch.4s, half.4s\n"
+          "fmla Ww23.4s, w_23.4s, half.4s\n"
+          "str qU24, [%x[outptr4], %x[mstride3]]\n"
+          "fmul Ww33.4s, scratch.4s, half.4s\n"
+          "fmls Ww33.4s, w_23.4s, half.4s\n"
+          "str qU34, [%x[outptr8], %x[mstride3]]\n"
+
+          // Compute and store U, only need to compute the 2nd and 3rd columns
+          // of U and update output pointers
+          "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
+          "fmul U12.4s, scratch.4s, half.4s\n"
+          "fmla U12.4s, Ww12.4s, half.4s\n"
+          "str qU12, [%x[outptr0], %x[mstride1]]\n"
+          "fmul U13.4s, scratch.4s, half.4s\n"
+          "fmls U13.4s, Ww12.4s, half.4s\n"
+          "str qU13, [%x[outptr0], %x[mstride2]]\n"
+          "add  %x[outptr0],  %x[outptr0], #0x10\n"
+
+          "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
+          "fmul U22.4s, scratch.4s, half.4s\n"
+          "fmla U22.4s, Ww22.4s, half.4s\n"
+          "str qU22, [%x[outptr4], %x[mstride1]]\n"
+          "fmul U23.4s, scratch.4s, half.4s\n"
+          "fmls U23.4s, Ww22.4s, half.4s\n"
+          "str qU23, [%x[outptr4], %x[mstride2]]\n"
+          "add  %x[outptr4],  %x[outptr4], #0x10\n"
+
+          "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
+          "fmul U32.4s, scratch.4s, half.4s\n"
+          "fmla U32.4s, Ww32.4s, half.4s\n"
+          "str qU32, [%x[outptr8], %x[mstride1]]\n"
+          "fmul U33.4s, scratch.4s, half.4s\n"
+          "fmls U33.4s, Ww32.4s, half.4s\n"
+          "str qU33, [%x[outptr8], %x[mstride2]]\n"
+          "add  %x[outptr8],  %x[outptr8], #0x10\n"
+
+          "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
+          "fmul U42.4s, scratch.4s, half.4s\n"
+          "fmla U42.4s, Ww42.4s, half.4s\n"
+          "str qU42, [%x[outptr12], %x[mstride1]]\n"
+          "fmul U43.4s, scratch.4s, half.4s\n"
+          "fmls U43.4s, Ww42.4s, half.4s\n"
+          "str qU43, [%x[outptr12], %x[mstride2]]\n"
+          "add %x[outptr12], %x[outptr12], #0x10\n"
+
+          "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
+          "bne 1b\n"
+
+        // Clear aliases
+        ".unreq half\n"
+        ".unreq scratch\n"
+        ".unreq w_11\n"  ".unreq qw_11\n"
+        ".unreq w_12\n"  ".unreq qw_12\n"
+        ".unreq w_13\n"  ".unreq qw_13\n"
+        ".unreq w_21\n"  ".unreq qw_21\n"
+        ".unreq w_22\n"  ".unreq qw_22\n"
+        ".unreq w_23\n"  ".unreq qw_23\n"
+        ".unreq w_31\n"  ".unreq qw_31\n"
+        ".unreq w_32\n"  ".unreq qw_32\n"
+        ".unreq w_33\n"  ".unreq qw_33\n"
+        ".unreq Ww11\n"  ".unreq Ww12\n"  ".unreq Ww13\n"
+        ".unreq Ww21\n"  ".unreq Ww22\n"  ".unreq Ww23\n"
+        ".unreq Ww31\n"  ".unreq Ww32\n"  ".unreq Ww33\n"
+        ".unreq Ww41\n"  ".unreq Ww42\n"  ".unreq Ww43\n"
+        ".unreq U11\n"   ".unreq U12\n"   ".unreq U13\n"   ".unreq U14\n"
+        ".unreq U21\n"   ".unreq U22\n"   ".unreq U23\n"   ".unreq U24\n"
+        ".unreq U31\n"   ".unreq U32\n"   ".unreq U33\n"   ".unreq U34\n"
+        ".unreq U41\n"   ".unreq U42\n"   ".unreq U43\n"   ".unreq U44\n"
+        ".unreq qU11\n"  ".unreq qU12\n"  ".unreq qU13\n"  ".unreq qU14\n"
+        ".unreq qU21\n"  ".unreq qU22\n"  ".unreq qU23\n"  ".unreq qU24\n"
+        ".unreq qU31\n"  ".unreq qU32\n"  ".unreq qU33\n"  ".unreq qU34\n"
+        ".unreq qU41\n"  ".unreq qU42\n"  ".unreq qU43\n"  ".unreq qU44\n"
+
+      : [inptr0] "+r" (inptr0),
+        [inptr1] "+r" (inptr1),
+        [inptr2] "+r" (inptr2),
+        [outptr0] "+r" (outptr0),
+        [outptr4] "+r" (outptr4),
+        [outptr8] "+r" (outptr8),
+        [outptr12] "+r" (outptr12),
+        [n_remaining_channels] "+r" (n_remaining_channels)
+      : [mstride1] "r" (sizeof(float) * mstride),
+        [mstride2] "r" (sizeof(float) * mstride * 2),
+        [mstride3] "r" (sizeof(float) * mstride * 3),
+        [colstride1] "r" (sizeof(float) * kernel_col_stride),
+        [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
+        [one_half] "r" (0.5f)
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24"
+    );
+
+    // Advance the output pointers to complete the matrix row stride
+    outptr0 += matrix_row_stride - n_output_channels;
+    outptr4 += matrix_row_stride - n_output_channels;
+    outptr8 += matrix_row_stride - n_output_channels;
+    outptr12 += matrix_row_stride - n_output_channels;
+  }
+}
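+
+/* For reference: per channel lane, the assembly above computes the Winograd
+ * F(2x2, 3x3) kernel transform U = W w W^T, where w is a 3x3 kernel tile and
+ * W is the 4x3 transform matrix
+ *
+ *       [  1    0    0  ]
+ *   W = [ 1/2  1/2  1/2 ]
+ *       [ 1/2 -1/2  1/2 ]
+ *       [  0    0    1  ],
+ *
+ * evaluated as Ww = W w followed by U = (Ww) W^T. A minimal scalar sketch of
+ * the same computation (illustrative only; the kernels above do not call it,
+ * and the helper name is hypothetical):
+ *
+ *   void reference_kernel_transform(const float w[3][3], float U[4][4]) {
+ *     float Ww[4][3];
+ *     for (int j = 0; j < 3; j++) {
+ *       Ww[0][j] = w[0][j];
+ *       Ww[1][j] = 0.5f * (w[0][j] + w[2][j]) + 0.5f * w[1][j];
+ *       Ww[2][j] = 0.5f * (w[0][j] + w[2][j]) - 0.5f * w[1][j];
+ *       Ww[3][j] = w[2][j];
+ *     }
+ *     for (int i = 0; i < 4; i++) {
+ *       U[i][0] = Ww[i][0];
+ *       U[i][1] = 0.5f * (Ww[i][0] + Ww[i][2]) + 0.5f * Ww[i][1];
+ *       U[i][2] = 0.5f * (Ww[i][0] + Ww[i][2]) - 0.5f * Ww[i][1];
+ *       U[i][3] = Ww[i][2];
+ *     }
+ *   }
+ */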
+
+template <>
+template <>
+inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<2>(
+    const float* const kernel,
+    const int n_input_channels,
+    const int n_output_channels,
+    float* const matrix_base,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  // Use one input pointer for each row of the kernel, plus two column-stride
+  // offsets to address the second and third columns.
+  const int kernel_col_stride = n_input_channels * n_output_channels;
+  const int kernel_row_stride = 3 * kernel_col_stride;
+  const float *inptr0 = kernel;
+  const float *inptr1 = kernel + kernel_row_stride;
+  const float *inptr2 = kernel + kernel_row_stride*2;
+
+  // Use four output pointers, for output matrices 0, 4, 8 and 12, plus three
+  // matrix-stride offsets to address the three matrices that follow each of them.
+  float  *outptr0 = matrix_base;
+  float  *outptr4 = matrix_base + mstride * 4;
+  float  *outptr8 = matrix_base + mstride * 8;
+  float *outptr12 = matrix_base + mstride * 12;
+
+  // For every input channel
+  for (int in_c = 0; in_c < n_input_channels; in_c++) {
+    int n_remaining_channels = n_output_channels;
+
+    asm volatile (
+        // Registers into which to read the kernel
+        "w_11 .req v0\n"  "qw_11 .req q0\n"  "dw_11 .req d0\n"
+        "w_12 .req v1\n"  "qw_12 .req q1\n"  "dw_12 .req d1\n"
+        "w_13 .req v2\n"  "qw_13 .req q2\n"  "dw_13 .req d2\n"
+        "w_21 .req v3\n"  "qw_21 .req q3\n"  "dw_21 .req d3\n"
+        "w_22 .req v4\n"  "qw_22 .req q4\n"  "dw_22 .req d4\n"
+        "w_23 .req v5\n"  "qw_23 .req q5\n"  "dw_23 .req d5\n"
+        "w_31 .req v6\n"  "qw_31 .req q6\n"  "dw_31 .req d6\n"
+        "w_32 .req v7\n"  "qw_32 .req q7\n"  "dw_32 .req d7\n"
+        "w_33 .req v8\n"  "qw_33 .req q8\n"  "dw_33 .req d8\n"
+
+        // Transformed matrix Ww
+        "Ww11 .req w_11\n"  "Ww12 .req w_12\n"  "Ww13 .req w_13\n"
+        "Ww21 .req  v9\n"   "Ww22 .req v10\n"   "Ww23 .req v11\n"
+        "Ww31 .req v12\n"   "Ww32 .req v13\n"   "Ww33 .req v14\n"
+        "Ww41 .req w_31\n"  "Ww42 .req w_32\n"  "Ww43 .req w_33\n"
+
+        // Output matrix U = (Ww) W^T
+        "U11 .req Ww11\n"   "U12 .req v15\n"  "U13 .req v16\n"  "U14 .req Ww13\n"
+        "U21 .req Ww21\n"   "U22 .req v17\n"  "U23 .req v18\n"  "U24 .req Ww23\n"
+        "U31 .req Ww31\n"   "U32 .req v19\n"  "U33 .req v20\n"  "U34 .req Ww33\n"
+        "U41 .req Ww41\n"   "U42 .req v21\n"  "U43 .req v22\n"  "U44 .req Ww43\n"
+
+        // Storage view of output matrices
+        "qU11 .req   q0\n"   "qU12 .req q15\n"  "qU13 .req q16\n"  "qU14 .req   q2\n"
+        "qU21 .req   q9\n"   "qU22 .req q17\n"  "qU23 .req q18\n"  "qU24 .req  q11\n"
+        "qU31 .req  q12\n"   "qU32 .req q19\n"  "qU33 .req q20\n"  "qU34 .req  q14\n"
+        "qU41 .req   q6\n"   "qU42 .req q21\n"  "qU43 .req q22\n"  "qU44 .req   q8\n"
+
+        "dU11 .req   d0\n"   "dU12 .req d15\n"  "dU13 .req d16\n"  "dU14 .req   d2\n"
+        "dU21 .req   d9\n"   "dU22 .req d17\n"  "dU23 .req d18\n"  "dU24 .req  d11\n"
+        "dU31 .req  d12\n"   "dU32 .req d19\n"  "dU33 .req d20\n"  "dU34 .req  d14\n"
+        "dU41 .req   d6\n"   "dU42 .req d21\n"  "dU43 .req d22\n"  "dU44 .req   d8\n"
+
+        "half .req v23\n"  // {0.5, ..., 0.5}
+        "dup half.4s, %w[one_half]\n"
+        "scratch .req v24\n"
+
+        // Subtract the tail length from the number of remaining channels; if
+        // only the tail remains, jump straight to it.
+        "subs %x[n_remaining_channels], %x[n_remaining_channels], #2\n"
+        "beq 2f\n"
+
+        "1:"
+          // Load tile of the kernel
+          "ldr qw_11, [%x[inptr0]]\n"
+          "str qU11, [%x[outptr0]]\n"
+          "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
+          "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
+          "str qU14, [%x[outptr0], %x[mstride3]]\n"
+          "add %x[inptr0], %x[inptr0], #0x10\n"
+
+          "ldr qw_21, [%x[inptr1]]\n"
+          "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
+          "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
+          "add %x[inptr1], %x[inptr1], #0x10\n"
+
+          "ldr qw_31, [%x[inptr2]]\n"
+          "str qU41, [%x[outptr12]]\n"
+          "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
+          "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
+          "str qU44, [%x[outptr12], %x[mstride3]]\n"
+          "add %x[inptr2], %x[inptr2], #0x10\n"
+
+          // Compute 2nd and 3rd rows of Ww
+          "fadd scratch.4s, w_11.4s, w_31.4s\n"
+          "fmul Ww21.4s, scratch.4s, half.4s\n"
+          "fmla Ww21.4s, w_21.4s, half.4s\n"
+          "str qU21, [%x[outptr4]]\n"
+          "fmul Ww31.4s, scratch.4s, half.4s\n"
+          "fmls Ww31.4s, w_21.4s, half.4s\n"
+          "str qU31, [%x[outptr8]]\n"
+
+          "fadd scratch.4s, w_12.4s, w_32.4s\n"
+          "fmul Ww22.4s, scratch.4s, half.4s\n"
+          "fmla Ww22.4s, w_22.4s, half.4s\n"
+          "fmul Ww32.4s, scratch.4s, half.4s\n"
+          "fmls Ww32.4s, w_22.4s, half.4s\n"
+
+          "fadd scratch.4s, w_13.4s, w_33.4s\n"
+          "fmul Ww23.4s, scratch.4s, half.4s\n"
+          "fmla Ww23.4s, w_23.4s, half.4s\n"
+          "str qU24, [%x[outptr4], %x[mstride3]]\n"
+          "fmul Ww33.4s, scratch.4s, half.4s\n"
+          "fmls Ww33.4s, w_23.4s, half.4s\n"
+          "str qU34, [%x[outptr8], %x[mstride3]]\n"
+
+          // Compute and store U, only need to compute the 2nd and 3rd columns
+          // of U and update output pointers
+          "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
+          "fmul U12.4s, scratch.4s, half.4s\n"
+          "fmla U12.4s, Ww12.4s, half.4s\n"
+          "str qU12, [%x[outptr0], %x[mstride1]]\n"
+          "fmul U13.4s, scratch.4s, half.4s\n"
+          "fmls U13.4s, Ww12.4s, half.4s\n"
+          "str qU13, [%x[outptr0], %x[mstride2]]\n"
+          "add  %x[outptr0],  %x[outptr0], #0x10\n"
+
+          "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
+          "fmul U22.4s, scratch.4s, half.4s\n"
+          "fmla U22.4s, Ww22.4s, half.4s\n"
+          "str qU22, [%x[outptr4], %x[mstride1]]\n"
+          "fmul U23.4s, scratch.4s, half.4s\n"
+          "fmls U23.4s, Ww22.4s, half.4s\n"
+          "str qU23, [%x[outptr4], %x[mstride2]]\n"
+          "add  %x[outptr4],  %x[outptr4], #0x10\n"
+
+          "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
+          "fmul U32.4s, scratch.4s, half.4s\n"
+          "fmla U32.4s, Ww32.4s, half.4s\n"
+          "str qU32, [%x[outptr8], %x[mstride1]]\n"
+          "fmul U33.4s, scratch.4s, half.4s\n"
+          "fmls U33.4s, Ww32.4s, half.4s\n"
+          "str qU33, [%x[outptr8], %x[mstride2]]\n"
+          "add  %x[outptr8],  %x[outptr8], #0x10\n"
+
+          "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
+          "fmul U42.4s, scratch.4s, half.4s\n"
+          "fmla U42.4s, Ww42.4s, half.4s\n"
+          "str qU42, [%x[outptr12], %x[mstride1]]\n"
+          "fmul U43.4s, scratch.4s, half.4s\n"
+          "fmls U43.4s, Ww42.4s, half.4s\n"
+          "str qU43, [%x[outptr12], %x[mstride2]]\n"
+          "add %x[outptr12], %x[outptr12], #0x10\n"
+
+          "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
+          "bne 1b\n"
+
+        // Tail size 2
+        "2:"
+          // Load tile of the kernel
+          "ldr dw_11, [%x[inptr0]]\n"
+          "str dU11, [%x[outptr0]]\n"
+          "ldr dw_12, [%x[inptr0], %x[colstride1]]\n"
+          "ldr dw_13, [%x[inptr0], %x[colstride2]]\n"
+          "str dU14, [%x[outptr0], %x[mstride3]]\n"
+          "add %x[inptr0], %x[inptr0], #0x08\n"
+
+          "ldr dw_21, [%x[inptr1]]\n"
+          "ldr dw_22, [%x[inptr1], %x[colstride1]]\n"
+          "ldr dw_23, [%x[inptr1], %x[colstride2]]\n"
+          "add %x[inptr1], %x[inptr1], #0x08\n"
+
+          "ldr dw_31, [%x[inptr2]]\n"
+          "str dU41, [%x[outptr12]]\n"
+          "ldr dw_32, [%x[inptr2], %x[colstride1]]\n"
+          "ldr dw_33, [%x[inptr2], %x[colstride2]]\n"
+          "str dU44, [%x[outptr12], %x[mstride3]]\n"
+          "add %x[inptr2], %x[inptr2], #0x08\n"
+
+          // Compute 2nd and 3rd rows of Ww
+          "fadd scratch.2s, w_11.2s, w_31.2s\n"
+          "fmul Ww21.2s, scratch.2s, half.2s\n"
+          "fmla Ww21.2s, w_21.2s, half.2s\n"
+          "str dU21, [%x[outptr4]]\n"
+          "fmul Ww31.2s, scratch.2s, half.2s\n"
+          "fmls Ww31.2s, w_21.2s, half.2s\n"
+          "str dU31, [%x[outptr8]]\n"
+
+          "fadd scratch.2s, w_12.2s, w_32.2s\n"
+          "fmul Ww22.2s, scratch.2s, half.2s\n"
+          "fmla Ww22.2s, w_22.2s, half.2s\n"
+          "fmul Ww32.2s, scratch.2s, half.2s\n"
+          "fmls Ww32.2s, w_22.2s, half.2s\n"
+
+          "fadd scratch.2s, w_13.2s, w_33.2s\n"
+          "fmul Ww23.2s, scratch.2s, half.2s\n"
+          "fmla Ww23.2s, w_23.2s, half.2s\n"
+          "str dU24, [%x[outptr4], %x[mstride3]]\n"
+          "fmul Ww33.2s, scratch.2s, half.2s\n"
+          "fmls Ww33.2s, w_23.2s, half.2s\n"
+          "str dU34, [%x[outptr8], %x[mstride3]]\n"
+
+          // Compute and store U, only need to compute the 2nd and 3rd columns of
+          // U and update output pointers
+          "fadd scratch.2s, Ww11.2s, Ww13.2s\n"
+          "fmul U12.2s, scratch.2s, half.2s\n"
+          "fmla U12.2s, Ww12.2s, half.2s\n"
+          "str dU12, [%x[outptr0], %x[mstride1]]\n"
+          "fmul U13.2s, scratch.2s, half.2s\n"
+          "fmls U13.2s, Ww12.2s, half.2s\n"
+          "str dU13, [%x[outptr0], %x[mstride2]]\n"
+          "add  %x[outptr0],  %x[outptr0], #0x08\n"
+
+          "fadd scratch.2s, Ww21.2s, Ww23.2s\n"
+          "fmul U22.2s, scratch.2s, half.2s\n"
+          "fmla U22.2s, Ww22.2s, half.2s\n"
+          "str dU22, [%x[outptr4], %x[mstride1]]\n"
+          "fmul U23.2s, scratch.2s, half.2s\n"
+          "fmls U23.2s, Ww22.2s, half.2s\n"
+          "str dU23, [%x[outptr4], %x[mstride2]]\n"
+          "add  %x[outptr4],  %x[outptr4], #0x08\n"
+
+          "fadd scratch.2s, Ww31.2s, Ww33.2s\n"
+          "fmul U32.2s, scratch.2s, half.2s\n"
+          "fmla U32.2s, Ww32.2s, half.2s\n"
+          "str dU32, [%x[outptr8], %x[mstride1]]\n"
+          "fmul U33.2s, scratch.2s, half.2s\n"
+          "fmls U33.2s, Ww32.2s, half.2s\n"
+          "str dU33, [%x[outptr8], %x[mstride2]]\n"
+          "add  %x[outptr8],  %x[outptr8], #0x08\n"
+
+          "fadd scratch.2s, Ww41.2s, Ww43.2s\n"
+          "fmul U42.2s, scratch.2s, half.2s\n"
+          "fmla U42.2s, Ww42.2s, half.2s\n"
+          "str dU42, [%x[outptr12], %x[mstride1]]\n"
+          "fmul U43.2s, scratch.2s, half.2s\n"
+          "fmls U43.2s, Ww42.2s, half.2s\n"
+          "str dU43, [%x[outptr12], %x[mstride2]]\n"
+          "add %x[outptr12], %x[outptr12], #0x08\n"
+
+        // Clear aliases
+        ".unreq half\n"
+        ".unreq scratch\n"
+        ".unreq w_11\n"  ".unreq qw_11\n" ".unreq dw_11\n"
+        ".unreq w_12\n"  ".unreq qw_12\n" ".unreq dw_12\n"
+        ".unreq w_13\n"  ".unreq qw_13\n" ".unreq dw_13\n"
+        ".unreq w_21\n"  ".unreq qw_21\n" ".unreq dw_21\n"
+        ".unreq w_22\n"  ".unreq qw_22\n" ".unreq dw_22\n"
+        ".unreq w_23\n"  ".unreq qw_23\n" ".unreq dw_23\n"
+        ".unreq w_31\n"  ".unreq qw_31\n" ".unreq dw_31\n"
+        ".unreq w_32\n"  ".unreq qw_32\n" ".unreq dw_32\n"
+        ".unreq w_33\n"  ".unreq qw_33\n" ".unreq dw_33\n"
+        ".unreq Ww11\n"  ".unreq Ww12\n"  ".unreq Ww13\n"
+        ".unreq Ww21\n"  ".unreq Ww22\n"  ".unreq Ww23\n"
+        ".unreq Ww31\n"  ".unreq Ww32\n"  ".unreq Ww33\n"
+        ".unreq Ww41\n"  ".unreq Ww42\n"  ".unreq Ww43\n"
+        ".unreq U11\n"   ".unreq U12\n"   ".unreq U13\n"   ".unreq U14\n"
+        ".unreq U21\n"   ".unreq U22\n"   ".unreq U23\n"   ".unreq U24\n"
+        ".unreq U31\n"   ".unreq U32\n"   ".unreq U33\n"   ".unreq U34\n"
+        ".unreq U41\n"   ".unreq U42\n"   ".unreq U43\n"   ".unreq U44\n"
+        ".unreq qU11\n"  ".unreq qU12\n"  ".unreq qU13\n"  ".unreq qU14\n"
+        ".unreq qU21\n"  ".unreq qU22\n"  ".unreq qU23\n"  ".unreq qU24\n"
+        ".unreq qU31\n"  ".unreq qU32\n"  ".unreq qU33\n"  ".unreq qU34\n"
+        ".unreq qU41\n"  ".unreq qU42\n"  ".unreq qU43\n"  ".unreq qU44\n"
+        ".unreq dU11\n"  ".unreq dU12\n"  ".unreq dU13\n"  ".unreq dU14\n"
+        ".unreq dU21\n"  ".unreq dU22\n"  ".unreq dU23\n"  ".unreq dU24\n"
+        ".unreq dU31\n"  ".unreq dU32\n"  ".unreq dU33\n"  ".unreq dU34\n"
+        ".unreq dU41\n"  ".unreq dU42\n"  ".unreq dU43\n"  ".unreq dU44\n"
+
+      : [inptr0] "+r" (inptr0),
+        [inptr1] "+r" (inptr1),
+        [inptr2] "+r" (inptr2),
+        [outptr0] "+r" (outptr0),
+        [outptr4] "+r" (outptr4),
+        [outptr8] "+r" (outptr8),
+        [outptr12] "+r" (outptr12),
+        [n_remaining_channels] "+r" (n_remaining_channels)
+      : [mstride1] "r" (sizeof(float) * mstride),
+        [mstride2] "r" (sizeof(float) * mstride * 2),
+        [mstride3] "r" (sizeof(float) * mstride * 3),
+        [colstride1] "r" (sizeof(float) * kernel_col_stride),
+        [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
+        [one_half] "r" (0.5f)
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24"
+    );
+
+    // Advance the output pointers to complete the matrix row stride
+    outptr0 += matrix_row_stride - n_output_channels;
+    outptr4 += matrix_row_stride - n_output_channels;
+    outptr8 += matrix_row_stride - n_output_channels;
+    outptr12 += matrix_row_stride - n_output_channels;
+  }
+}
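+
+/* Control flow shared by the tail specialisations above and below: the tail
+ * length (2 or 1) is peeled off the channel count up front, any remaining
+ * full blocks of four output channels are processed with q-register (4-lane)
+ * arithmetic, and the final partial block repeats the same arithmetic on
+ * narrower d- or s-register views. Roughly (illustrative sketch only,
+ * hypothetical helper names):
+ *
+ *   int remaining = n_output_channels - tail;
+ *   while (remaining > 0) {           // "1:" loop, q registers
+ *     process_four_channels();
+ *     remaining -= 4;
+ *   }
+ *   process_tail_channels(tail);      // "2:" block, d or s registers
+ */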
+
+template <>
+template <>
+inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<1>(
+    const float* const kernel,
+    const int n_input_channels,
+    const int n_output_channels,
+    float* const matrix_base,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  // Use one input pointer for each row of the kernel, plus two column-stride
+  // offsets to address the second and third columns.
+  const int kernel_col_stride = n_input_channels * n_output_channels;
+  const int kernel_row_stride = 3 * kernel_col_stride;
+  const float *inptr0 = kernel;
+  const float *inptr1 = kernel + kernel_row_stride;
+  const float *inptr2 = kernel + kernel_row_stride*2;
+
+  // Use four output pointers, for output matrices 0, 4, 8 and 12, plus three
+  // matrix-stride offsets to address the three matrices that follow each of them.
+  float  *outptr0 = matrix_base;
+  float  *outptr4 = matrix_base + mstride * 4;
+  float  *outptr8 = matrix_base + mstride * 8;
+  float *outptr12 = matrix_base + mstride * 12;
+
+  // For every input channel
+  for (int in_c = 0; in_c < n_input_channels; in_c++) {
+    int n_remaining_channels = n_output_channels;
+
+    asm volatile (
+        // Registers into which to read the kernel
+        "w_11 .req v0\n"  "qw_11 .req q0\n"  "sw_11 .req s0\n"
+        "w_12 .req v1\n"  "qw_12 .req q1\n"  "sw_12 .req s1\n"
+        "w_13 .req v2\n"  "qw_13 .req q2\n"  "sw_13 .req s2\n"
+        "w_21 .req v3\n"  "qw_21 .req q3\n"  "sw_21 .req s3\n"
+        "w_22 .req v4\n"  "qw_22 .req q4\n"  "sw_22 .req s4\n"
+        "w_23 .req v5\n"  "qw_23 .req q5\n"  "sw_23 .req s5\n"
+        "w_31 .req v6\n"  "qw_31 .req q6\n"  "sw_31 .req s6\n"
+        "w_32 .req v7\n"  "qw_32 .req q7\n"  "sw_32 .req s7\n"
+        "w_33 .req v8\n"  "qw_33 .req q8\n"  "sw_33 .req s8\n"
+
+        // Transformed matrix Ww
+        "Ww11 .req w_11\n"  "Ww12 .req w_12\n"  "Ww13 .req w_13\n"
+        "Ww21 .req  v9\n"   "Ww22 .req v10\n"   "Ww23 .req v11\n"
+        "Ww31 .req v12\n"   "Ww32 .req v13\n"   "Ww33 .req v14\n"
+        "Ww41 .req w_31\n"  "Ww42 .req w_32\n"  "Ww43 .req w_33\n"
+
+        // Output matrix U = (Ww) W^T
+        "U11 .req Ww11\n"   "U12 .req v15\n"  "U13 .req v16\n"  "U14 .req Ww13\n"
+        "U21 .req Ww21\n"   "U22 .req v17\n"  "U23 .req v18\n"  "U24 .req Ww23\n"
+        "U31 .req Ww31\n"   "U32 .req v19\n"  "U33 .req v20\n"  "U34 .req Ww33\n"
+        "U41 .req Ww41\n"   "U42 .req v21\n"  "U43 .req v22\n"  "U44 .req Ww43\n"
+
+        // Storage view of output matrices
+        "qU11 .req   q0\n"   "qU12 .req q15\n"  "qU13 .req q16\n"  "qU14 .req   q2\n"
+        "qU21 .req   q9\n"   "qU22 .req q17\n"  "qU23 .req q18\n"  "qU24 .req  q11\n"
+        "qU31 .req  q12\n"   "qU32 .req q19\n"  "qU33 .req q20\n"  "qU34 .req  q14\n"
+        "qU41 .req   q6\n"   "qU42 .req q21\n"  "qU43 .req q22\n"  "qU44 .req   q8\n"
+
+        "sU11 .req   s0\n"   "sU12 .req s15\n"  "sU13 .req s16\n"  "sU14 .req   s2\n"
+        "sU21 .req   s9\n"   "sU22 .req s17\n"  "sU23 .req s18\n"  "sU24 .req  s11\n"
+        "sU31 .req  s12\n"   "sU32 .req s19\n"  "sU33 .req s20\n"  "sU34 .req  s14\n"
+        "sU41 .req   s6\n"   "sU42 .req s21\n"  "sU43 .req s22\n"  "sU44 .req   s8\n"
+
+        "half .req v23\n"  // {0.5, ..., 0.5}
+        "dup half.4s, %w[one_half]\n"
+        "scratch .req v24\n"
+
+        // Subtract the tail length from the number of remaining channels; if
+        // only the tail remains, jump straight to it.
+        "subs %x[n_remaining_channels], %x[n_remaining_channels], #1\n"
+        "beq 2f\n"
+
+        "1:"
+          // Load tile of the kernel
+          "ldr qw_11, [%x[inptr0]]\n"
+          "str qU11, [%x[outptr0]]\n"
+          "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
+          "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
+          "str qU14, [%x[outptr0], %x[mstride3]]\n"
+          "add %x[inptr0], %x[inptr0], #0x10\n"
+
+          "ldr qw_21, [%x[inptr1]]\n"
+          "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
+          "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
+          "add %x[inptr1], %x[inptr1], #0x10\n"
+
+          "ldr qw_31, [%x[inptr2]]\n"
+          "str qU41, [%x[outptr12]]\n"
+          "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
+          "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
+          "str qU44, [%x[outptr12], %x[mstride3]]\n"
+          "add %x[inptr2], %x[inptr2], #0x10\n"
+
+          // Compute 2nd and 3rd rows of Ww
+          "fadd scratch.4s, w_11.4s, w_31.4s\n"
+          "fmul Ww21.4s, scratch.4s, half.4s\n"
+          "fmla Ww21.4s, w_21.4s, half.4s\n"
+          "str qU21, [%x[outptr4]]\n"
+          "fmul Ww31.4s, scratch.4s, half.4s\n"
+          "fmls Ww31.4s, w_21.4s, half.4s\n"
+          "str qU31, [%x[outptr8]]\n"
+
+          "fadd scratch.4s, w_12.4s, w_32.4s\n"
+          "fmul Ww22.4s, scratch.4s, half.4s\n"
+          "fmla Ww22.4s, w_22.4s, half.4s\n"
+          "fmul Ww32.4s, scratch.4s, half.4s\n"
+          "fmls Ww32.4s, w_22.4s, half.4s\n"
+
+          "fadd scratch.4s, w_13.4s, w_33.4s\n"
+          "fmul Ww23.4s, scratch.4s, half.4s\n"
+          "fmla Ww23.4s, w_23.4s, half.4s\n"
+          "str qU24, [%x[outptr4], %x[mstride3]]\n"
+          "fmul Ww33.4s, scratch.4s, half.4s\n"
+          "fmls Ww33.4s, w_23.4s, half.4s\n"
+          "str qU34, [%x[outptr8], %x[mstride3]]\n"
+
+          // Compute and store U, only need to compute the 2nd and 3rd columns
+          // of U and update output pointers
+          "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
+          "fmul U12.4s, scratch.4s, half.4s\n"
+          "fmla U12.4s, Ww12.4s, half.4s\n"
+          "str qU12, [%x[outptr0], %x[mstride1]]\n"
+          "fmul U13.4s, scratch.4s, half.4s\n"
+          "fmls U13.4s, Ww12.4s, half.4s\n"
+          "str qU13, [%x[outptr0], %x[mstride2]]\n"
+          "add  %x[outptr0],  %x[outptr0], #0x10\n"
+
+          "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
+          "fmul U22.4s, scratch.4s, half.4s\n"
+          "fmla U22.4s, Ww22.4s, half.4s\n"
+          "str qU22, [%x[outptr4], %x[mstride1]]\n"
+          "fmul U23.4s, scratch.4s, half.4s\n"
+          "fmls U23.4s, Ww22.4s, half.4s\n"
+          "str qU23, [%x[outptr4], %x[mstride2]]\n"
+          "add  %x[outptr4],  %x[outptr4], #0x10\n"
+
+          "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
+          "fmul U32.4s, scratch.4s, half.4s\n"
+          "fmla U32.4s, Ww32.4s, half.4s\n"
+          "str qU32, [%x[outptr8], %x[mstride1]]\n"
+          "fmul U33.4s, scratch.4s, half.4s\n"
+          "fmls U33.4s, Ww32.4s, half.4s\n"
+          "str qU33, [%x[outptr8], %x[mstride2]]\n"
+          "add  %x[outptr8],  %x[outptr8], #0x10\n"
+
+          "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
+          "fmul U42.4s, scratch.4s, half.4s\n"
+          "fmla U42.4s, Ww42.4s, half.4s\n"
+          "str qU42, [%x[outptr12], %x[mstride1]]\n"
+          "fmul U43.4s, scratch.4s, half.4s\n"
+          "fmls U43.4s, Ww42.4s, half.4s\n"
+          "str qU43, [%x[outptr12], %x[mstride2]]\n"
+          "add %x[outptr12], %x[outptr12], #0x10\n"
+
+          "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
+          "bne 1b\n"
+
+        // Tail size 1
+        "2:"
+          // Load tile of the kernel
+          "ldr sw_11, [%x[inptr0]]\n"
+          "str sU11, [%x[outptr0]]\n"
+          "ldr sw_12, [%x[inptr0], %x[colstride1]]\n"
+          "ldr sw_13, [%x[inptr0], %x[colstride2]]\n"
+          "str sU14, [%x[outptr0], %x[mstride3]]\n"
+          "add %x[inptr0], %x[inptr0], #0x04\n"
+
+          "ldr sw_21, [%x[inptr1]]\n"
+          "ldr sw_22, [%x[inptr1], %x[colstride1]]\n"
+          "ldr sw_23, [%x[inptr1], %x[colstride2]]\n"
+          "add %x[inptr1], %x[inptr1], #0x04\n"
+
+          "ldr sw_31, [%x[inptr2]]\n"
+          "str sU41, [%x[outptr12]]\n"
+          "ldr sw_32, [%x[inptr2], %x[colstride1]]\n"
+          "ldr sw_33, [%x[inptr2], %x[colstride2]]\n"
+          "str sU44, [%x[outptr12], %x[mstride3]]\n"
+          "add %x[inptr2], %x[inptr2], #0x04\n"
+
+          // Compute 2nd and 3rd rows of Ww
+          "fadd scratch.2s, w_11.2s, w_31.2s\n"
+          "fmul Ww21.2s, scratch.2s, half.2s\n"
+          "fmla Ww21.2s, w_21.2s, half.2s\n"
+          "str sU21, [%x[outptr4]]\n"
+          "fmul Ww31.2s, scratch.2s, half.2s\n"
+          "fmls Ww31.2s, w_21.2s, half.2s\n"
+          "str sU31, [%x[outptr8]]\n"
+
+          "fadd scratch.2s, w_12.2s, w_32.2s\n"
+          "fmul Ww22.2s, scratch.2s, half.2s\n"
+          "fmla Ww22.2s, w_22.2s, half.2s\n"
+          "fmul Ww32.2s, scratch.2s, half.2s\n"
+          "fmls Ww32.2s, w_22.2s, half.2s\n"
+
+          "fadd scratch.2s, w_13.2s, w_33.2s\n"
+          "fmul Ww23.2s, scratch.2s, half.2s\n"
+          "fmla Ww23.2s, w_23.2s, half.2s\n"
+          "str sU24, [%x[outptr4], %x[mstride3]]\n"
+          "fmul Ww33.2s, scratch.2s, half.2s\n"
+          "fmls Ww33.2s, w_23.2s, half.2s\n"
+          "str sU34, [%x[outptr8], %x[mstride3]]\n"
+
+          // Compute and store U, only need to compute the 2nd and 3rd columns of
+          // U and update output pointers
+          "fadd scratch.2s, Ww11.2s, Ww13.2s\n"
+          "fmul U12.2s, scratch.2s, half.2s\n"
+          "fmla U12.2s, Ww12.2s, half.2s\n"
+          "str sU12, [%x[outptr0], %x[mstride1]]\n"
+          "fmul U13.2s, scratch.2s, half.2s\n"
+          "fmls U13.2s, Ww12.2s, half.2s\n"
+          "str sU13, [%x[outptr0], %x[mstride2]]\n"
+          "add  %x[outptr0],  %x[outptr0], #0x04\n"
+
+          "fadd scratch.2s, Ww21.2s, Ww23.2s\n"
+          "fmul U22.2s, scratch.2s, half.2s\n"
+          "fmla U22.2s, Ww22.2s, half.2s\n"
+          "str sU22, [%x[outptr4], %x[mstride1]]\n"
+          "fmul U23.2s, scratch.2s, half.2s\n"
+          "fmls U23.2s, Ww22.2s, half.2s\n"
+          "str sU23, [%x[outptr4], %x[mstride2]]\n"
+          "add  %x[outptr4],  %x[outptr4], #0x04\n"
+
+          "fadd scratch.2s, Ww31.2s, Ww33.2s\n"
+          "fmul U32.2s, scratch.2s, half.2s\n"
+          "fmla U32.2s, Ww32.2s, half.2s\n"
+          "str sU32, [%x[outptr8], %x[mstride1]]\n"
+          "fmul U33.2s, scratch.2s, half.2s\n"
+          "fmls U33.2s, Ww32.2s, half.2s\n"
+          "str sU33, [%x[outptr8], %x[mstride2]]\n"
+          "add  %x[outptr8],  %x[outptr8], #0x04\n"
+
+          "fadd scratch.2s, Ww41.2s, Ww43.2s\n"
+          "fmul U42.2s, scratch.2s, half.2s\n"
+          "fmla U42.2s, Ww42.2s, half.2s\n"
+          "str sU42, [%x[outptr12], %x[mstride1]]\n"
+          "fmul U43.2s, scratch.2s, half.2s\n"
+          "fmls U43.2s, Ww42.2s, half.2s\n"
+          "str sU43, [%x[outptr12], %x[mstride2]]\n"
+          "add %x[outptr12], %x[outptr12], #0x04\n"
+
+        // Clear aliases
+        ".unreq half\n"
+        ".unreq scratch\n"
+        ".unreq w_11\n"  ".unreq qw_11\n" ".unreq sw_11\n"
+        ".unreq w_12\n"  ".unreq qw_12\n" ".unreq sw_12\n"
+        ".unreq w_13\n"  ".unreq qw_13\n" ".unreq sw_13\n"
+        ".unreq w_21\n"  ".unreq qw_21\n" ".unreq sw_21\n"
+        ".unreq w_22\n"  ".unreq qw_22\n" ".unreq sw_22\n"
+        ".unreq w_23\n"  ".unreq qw_23\n" ".unreq sw_23\n"
+        ".unreq w_31\n"  ".unreq qw_31\n" ".unreq sw_31\n"
+        ".unreq w_32\n"  ".unreq qw_32\n" ".unreq sw_32\n"
+        ".unreq w_33\n"  ".unreq qw_33\n" ".unreq sw_33\n"
+        ".unreq Ww11\n"  ".unreq Ww12\n"  ".unreq Ww13\n"
+        ".unreq Ww21\n"  ".unreq Ww22\n"  ".unreq Ww23\n"
+        ".unreq Ww31\n"  ".unreq Ww32\n"  ".unreq Ww33\n"
+        ".unreq Ww41\n"  ".unreq Ww42\n"  ".unreq Ww43\n"
+        ".unreq U11\n"   ".unreq U12\n"   ".unreq U13\n"   ".unreq U14\n"
+        ".unreq U21\n"   ".unreq U22\n"   ".unreq U23\n"   ".unreq U24\n"
+        ".unreq U31\n"   ".unreq U32\n"   ".unreq U33\n"   ".unreq U34\n"
+        ".unreq U41\n"   ".unreq U42\n"   ".unreq U43\n"   ".unreq U44\n"
+        ".unreq qU11\n"  ".unreq qU12\n"  ".unreq qU13\n"  ".unreq qU14\n"
+        ".unreq qU21\n"  ".unreq qU22\n"  ".unreq qU23\n"  ".unreq qU24\n"
+        ".unreq qU31\n"  ".unreq qU32\n"  ".unreq qU33\n"  ".unreq qU34\n"
+        ".unreq qU41\n"  ".unreq qU42\n"  ".unreq qU43\n"  ".unreq qU44\n"
+        ".unreq sU11\n"  ".unreq sU12\n"  ".unreq sU13\n"  ".unreq sU14\n"
+        ".unreq sU21\n"  ".unreq sU22\n"  ".unreq sU23\n"  ".unreq sU24\n"
+        ".unreq sU31\n"  ".unreq sU32\n"  ".unreq sU33\n"  ".unreq sU34\n"
+        ".unreq sU41\n"  ".unreq sU42\n"  ".unreq sU43\n"  ".unreq sU44\n"
+
+      : [inptr0] "+r" (inptr0),
+        [inptr1] "+r" (inptr1),
+        [inptr2] "+r" (inptr2),
+        [outptr0] "+r" (outptr0),
+        [outptr4] "+r" (outptr4),
+        [outptr8] "+r" (outptr8),
+        [outptr12] "+r" (outptr12),
+        [n_remaining_channels] "+r" (n_remaining_channels)
+      : [mstride1] "r" (sizeof(float) * mstride),
+        [mstride2] "r" (sizeof(float) * mstride * 2),
+        [mstride3] "r" (sizeof(float) * mstride * 3),
+        [colstride1] "r" (sizeof(float) * kernel_col_stride),
+        [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
+        [one_half] "r" (0.5f)
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24"
+    );
+
+    // Advance the output pointers to complete the matrix row stride
+    outptr0 += matrix_row_stride - n_output_channels;
+    outptr4 += matrix_row_stride - n_output_channels;
+    outptr8 += matrix_row_stride - n_output_channels;
+    outptr12 += matrix_row_stride - n_output_channels;
+  }
+}
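+
+/* Note on data layout, as implied by the strides used above: the kernel is
+ * read in [kernel row][kernel column][input channel][output channel] order
+ * (output channels contiguous), and each of the sixteen output matrices is
+ * written one row per input channel, with matrix_row_stride floats between
+ * rows, of which the first n_output_channels are populated here.
+ */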
+}
+#endif  // __aarch64__
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp
new file mode 100644
index 0000000..0992c0b
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+namespace winograd {
+  /* Transform from the Winograd domain back to the spatial domain.
+   */
+  template <typename T>
+  struct Winograd2x2_3x3GemmOutput {
+    static void execute(
+      const Tensor4DShape &output_shape,
+      T* const matrix_base,
+      const int matrix_stride,
+      const int matrix_row_stride,
+      T* const output
+    );
+
+    protected:
+    /* Specialised implementation method. */
+    template <bool tail_M, bool tail_N, int channel_tail>
+    static void _execute(
+      const Tensor4DShape &output_shape,
+      T *output,
+      const T *input,
+      const int matrix_stride,
+      const int matrix_row_stride
+    );
+  };
+
+  /* Two-stage implementation of the transformation from the Winograd domain.
+   *
+   * First computes Z.F and then computes (Z.F).Z^T.
+   */
+  template <typename T>
+  struct Winograd2x2_3x3GemmOutput_TwoStage {
+    static void execute(
+      const Tensor4DShape &output_shape,
+      T* const matrix_base,
+      const int matrix_stride,
+      const int matrix_row_stride,
+      T* const output
+    );
+
+    protected:
+    template <int channel_tail>
+    static void compute_zf(
+      const int n_rows, const int n_channels,
+      T* const zf, const T* const input[16]
+    );
+
+    template <bool tail_M, bool tail_N, int channel_tail>
+    static void compute_zfzT(
+      const Tensor4DShape &output_shape,
+      T* const output, const T* const zf
+    );
+  };
+}
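+
+/* For reference: per output tile and channel, the implementations declared
+ * above evaluate the F(2x2, 3x3) inverse transform f = Z^T F Z, where F is
+ * the 4x4 tile gathered from the sixteen intermediate matrices and
+ *
+ *       [ 1  0 ]
+ *   Z = [ 1  1 ]
+ *       [ 1 -1 ]
+ *       [ 0 -1 ].
+ *
+ * A minimal scalar sketch of the per-channel computation (illustrative only;
+ * the specialised implementations included below do not call it, and the
+ * helper name is hypothetical):
+ *
+ *   void reference_output_transform(const float F[4][4], float f[2][2]) {
+ *     float FZ[4][2];
+ *     for (int i = 0; i < 4; i++) {
+ *       FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ *       FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ *     }
+ *     for (int j = 0; j < 2; j++) {
+ *       f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ *       f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ *     }
+ *   }
+ */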
+
+#include "output_2x2_3x3/a64_float.hpp"
+// #include "output_2x2_3x3/a64_float_two_stage.hpp"
+
+/*****************************************************************************/
+/*
+template <typename T>
+void winograd::Winograd2x2_3x3GemmOutput<T>::execute(
+    const Tensor4DShape &output_shape,
+    const int tile_M,
+    const int tile_N,
+    T* const matrix_base,
+    const int matrix_stride,
+    const int matrix_row_stride,
+    T* const output
+) {
+  T* const antipadding = reinterpret_cast<T *>(malloc(sizeof(T) * output_shape.n_channels));
+
+  // Get input pointers
+  const T* inptrs[16];
+  for (int i = 0; i < 16; i++) {
+    inptrs[i] = matrices[i];
+  }
+
+  for (int batch = 0; batch < output_shape.n_batches; batch++) {
+    for (int tile_i = 0; tile_i < tile_M; tile_i++) {
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        // Get pointers for each of the 4 output cells required for this computation
+        T* outptrs[4];
+        for (int cell_i = 0, c = 0; cell_i < 2; cell_i++) {
+          for (int cell_j = 0; cell_j < 2; cell_j++, c++) {
+            const int i = tile_i*2 + cell_i;
+            const int j = tile_j*2 + cell_j;
+
+            if (i < output_shape.n_rows && j < output_shape.n_cols) {
+              outptrs[c] = output + (
+                  (batch*output_shape.n_rows + i) * output_shape.n_cols +
+                j) * output_shape.n_channels;
+            } else {
+              outptrs[c] = antipadding;
+            }
+          }  // cell_j
+        }  // cell_i
+
+        for (int n = 0; n < output_shape.n_channels; n++) {
+          // Read 16 values and progress pointers
+          T v[16];
+          for (int i = 0; i < 16; i++) {
+            v[i] = *(inptrs[i]++);
+          }
+
+          // Compute output for 4 pixels
+          *(outptrs[0]++) = v[ 0] + v[ 1] + v[ 2] +
+                            v[ 4] + v[ 5] + v[ 6] +
+                            v[ 8] + v[ 9] + v[10];
+          *(outptrs[1]++) = v[ 1] - v[ 2] - v[ 3] +
+                            v[ 5] - v[ 6] - v[ 7] +
+                            v[ 9] - v[10] - v[11];
+          *(outptrs[2]++) = v[ 4] + v[ 5] + v[ 6] -
+                            v[ 8] - v[ 9] - v[10] -
+                            v[12] - v[13] - v[14];
+          *(outptrs[3]++) = v[ 5] - v[ 6] - v[ 7] -
+                            v[ 9] + v[10] + v[11] -
+                            v[13] + v[14] + v[15];
+        }  // output_channel
+      }  // tile_j
+    }  // tile_i
+  }  // batch
+
+  free(antipadding);
+}
+*/
+
+/*****************************************************************************/
+/*
+template <typename T>
+void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::execute(
+    const Tensor4DShape &output_shape,
+    T* const matrices[16], T* const output
+) {
+  // Allocate memory for the intermediate matrices
+  const int tile_M = iceildiv(output_shape.n_rows, 2);
+  const int tile_N = iceildiv(output_shape.n_cols, 2);
+  const int n_rows = output_shape.n_batches * tile_M * tile_N;
+  const int n_channels = output_shape.n_channels;
+  T* matrices_zf = reinterpret_cast<T*>(
+    calloc(8 * n_rows * n_channels, sizeof(T))
+  );
+  
+  // Perform the first stage transform, computing ZF.
+  // Specializations should dispatch to different methods based on tail size.
+  compute_zf<0>(n_rows, n_channels, matrices_zf, matrices);
+  
+  // Perform the second stage transform, finishing Z F Z^T - variable dispatch
+  // based on size of the output. Specialisations can also dispatch based on
+  // the tail-size of the channel.
+  if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
+    compute_zfzT<true, true, 0>(output_shape, output, matrices_zf);
+  } else if (output_shape.n_rows % 2) {
+    compute_zfzT<true, false, 0>(output_shape, output, matrices_zf);
+  } else if (output_shape.n_cols % 2) {
+    compute_zfzT<false, true, 0>(output_shape, output, matrices_zf);
+  } else {
+    compute_zfzT<false, false, 0>(output_shape, output, matrices_zf);
+  }
+
+  free(reinterpret_cast<void*>(matrices_zf));
+}
+
+template <typename T>
+template <int channel_tail>
+void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::compute_zf(
+    const int n_rows, const int n_channels,
+    T* output, const T* const input[16]
+) {
+  // Extract 8 output pointers
+  T* outptr[8];
+  for (int i = 0; i < 8; i++) {
+    outptr[i] = output + i*n_rows*n_channels;
+  }
+
+  // Copy the 16 input pointers
+  const T* inptr[16];
+  for (int i = 0; i < 16; i++) {
+    inptr[i] = input[i];
+  }
+
+  // For every row of the matrices
+  for (int i = 0; i < n_rows; i++) {
+    // For every channel
+    for (int j = 0; j < n_channels; j++) {
+      // Extract values from the input matrices
+      T val[16];
+      for (int n = 0; n < 16; n++) {
+        val[n] = *(inptr[n]++);
+      }
+
+      // Compute output values
+      *(outptr[0]++) = val[0] + val[1] + val[2];
+      *(outptr[1]++) = val[1] - val[2] - val[3];
+      *(outptr[2]++) = val[4] + val[5] + val[6];
+      *(outptr[3]++) = val[5] - val[6] - val[7];
+      *(outptr[4]++) = val[8] + val[9] + val[10];
+      *(outptr[5]++) = val[9] - val[10] - val[11];
+      *(outptr[6]++) = val[12] + val[13] + val[14];
+      *(outptr[7]++) = val[13] - val[14] - val[15];
+    }
+  }
+}
+
+template <typename T>
+template <bool tail_M, bool tail_N, int channel_tail>
+void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::compute_zfzT(
+    const Tensor4DShape &output_shape,
+    T* const output, const T* const input
+) {
+  // Sizing information
+  const int tile_M = output_shape.n_rows / 2;
+  const int tile_N = output_shape.n_cols / 2;
+
+  const int n_rows = (output_shape.n_batches *
+                      (tile_M + (tail_M ? 1 : 0)) *
+                      (tile_N + (tail_N ? 1 : 0)));
+  const int n_channels = output_shape.n_channels;
+
+  // Extract 8 input pointers
+  const T* inptr[8];
+  for (int i = 0; i < 8; i++) {
+    inptr[i] = input + i*n_rows*n_channels;
+  }
+
+  // Extract 4 output pointers
+  T* outptr00 = output;
+  T* outptr01 = outptr00 + n_channels;
+  T* outptr10 = outptr00 + output_shape.n_cols * n_channels;
+  T* outptr11 = outptr10 + n_channels;
+
+  // Progress over the output tiles, generating output values.
+  for (int batch = 0; batch < output_shape.n_batches; batch++) {
+    for (int tile_i = 0; tile_i < tile_M; tile_i++) {
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          T v[8];
+          for (int i = 0; i < 8; i++) {
+            v[i] = *(inptr[i]++);
+          }
+
+          // Compute the output values and progress the output pointers.
+          *(outptr00++) = v[0] + v[2] + v[4];
+          *(outptr01++) = v[1] + v[3] + v[5];
+          *(outptr10++) = v[2] - v[4] - v[6];
+          *(outptr11++) = v[3] - v[5] - v[7];
+        }
+
+        // Progress the output pointers to the next column
+        outptr00 += n_channels;
+        outptr01 += n_channels;
+        outptr10 += n_channels;
+        outptr11 += n_channels;
+      }
+
+      if (tail_N) {
+        // Only evaluate the left-most columns of the output
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          T v[8];
+          for (int i = 0; i < 4; i++) {
+            v[i * 2] = *inptr[i * 2];
+          }
+          for (int i = 0; i < 8; i++) {
+            inptr[i]++;
+          }
+
+          // Compute the output values and progress the output pointers.
+          *(outptr00++) = v[0] + v[2] + v[4];
+          *(outptr10++) = v[2] - v[4] - v[6];
+        }
+
+        // Progress the output pointers to the next column
+        outptr01 += n_channels;  // Account for being skipped above
+        outptr11 += n_channels;  // Account for being skipped above
+      }
+
+      // Progress the output pointers to the next row
+      outptr00 += output_shape.n_cols * n_channels;
+      outptr01 += output_shape.n_cols * n_channels;
+      outptr10 += output_shape.n_cols * n_channels;
+      outptr11 += output_shape.n_cols * n_channels;
+    }
+
+    if (tail_M) {
+      // Only work on the upper row of the output
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          T v[8];
+          for (int i = 0; i < 8; i++) {
+            v[i] = *(inptr[i]++);
+          }
+
+          // Compute the output values and progress the output pointers.
+          *(outptr00++) = v[0] + v[2] + v[4];
+          *(outptr01++) = v[1] + v[3] + v[5];
+        }
+
+        // Progress the output pointers to the next column
+        outptr00 += n_channels;
+        outptr01 += n_channels;
+        outptr10 += 2 * n_channels;  // Account for being skipped above
+        outptr11 += 2 * n_channels;  // Account for being skipped above
+      }
+
+      if (tail_N) {
+        // Only evaluate the upper-left cell of the output
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          T v[8];
+          for (int i = 0; i < 3; i++) {
+            v[i * 2] = *inptr[i * 2];
+          }
+          for (int i = 0; i < 8; i++) {
+            inptr[i]++;
+          }
+
+          // Compute the output values and progress the output pointers.
+          *(outptr00++) = v[0] + v[2] + v[4];
+        }
+
+        // Progress the output pointers to the next column
+        outptr01 += n_channels;  // Account for being skipped above
+        outptr10 += n_channels;  // Account for being skipped above
+        outptr11 += n_channels;  // Account for being skipped above
+      }
+    }
+  }
+}
+*/
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
new file mode 100644
index 0000000..bf6ba90
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+/* Float implementation for AArch64.
+ */
+#ifdef __aarch64__
+namespace winograd {
+
+
+template <>
+template <>
+inline void Winograd2x2_3x3GemmOutput<float>::_execute<false, false, 0>(
+    const Tensor4DShape &output_shape,
+    float *output,
+    const float *input,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  const int tile_M = output_shape.n_rows / 2;
+  const int tile_N = output_shape.n_cols / 2;
+  int batch = output_shape.n_batches;
+  float *outptr = output;
+
+  const float *inptr0 = input;
+  const float *inptr4 = input + 4 * mstride;
+  const float *inptr8 = input + 8 * mstride;
+  const float *inptr12 = input + 12 * mstride;
+
+  const size_t col_stride = sizeof(float) * output_shape.n_channels;
+  const size_t row_stride = col_stride * tile_N * 2;
+
+  asm volatile (
+      // Aliases for elements of the input matrix `F`
+      // V-register      Q-register
+      "F11 .req  v0\n" "qF11 .req  q0\n"
+      "F12 .req  v1\n" "qF12 .req  q1\n"
+      "F13 .req  v2\n" "qF13 .req  q2\n"
+      "F14 .req  v3\n" "qF14 .req  q3\n"
+      "F21 .req  v4\n" "qF21 .req  q4\n"
+      "F22 .req  v5\n" "qF22 .req  q5\n"
+      "F23 .req  v6\n" "qF23 .req  q6\n"
+      "F24 .req  v7\n" "qF24 .req  q7\n"
+      "F31 .req  v8\n" "qF31 .req  q8\n"
+      "F32 .req  v9\n" "qF32 .req  q9\n"
+      "F33 .req v10\n" "qF33 .req q10\n"
+      "F34 .req v11\n" "qF34 .req q11\n"
+      "F41 .req v12\n" "qF41 .req q12\n"
+      "F42 .req v13\n" "qF42 .req q13\n"
+      "F43 .req v14\n" "qF43 .req q14\n"
+      "F44 .req v15\n" "qF44 .req q15\n"
+
+      // Aliases for elements of the intermediate matrix `FZ`
+      "FZ11 .req v16\n"
+      "FZ12 .req v17\n"
+      "FZ21 .req v18\n"
+      "FZ22 .req v19\n"
+      "FZ31 .req v20\n"
+      "FZ32 .req v21\n"
+      "FZ41 .req v22\n"
+      "FZ42 .req v23\n"
+
+      // Aliases for elements of the output matrix `f` (called `g` due to case
+      // insensitivity of aliases).
+      " g11 .req v24\n"
+      "qg11 .req q24\n"
+      " g12 .req v25\n"
+      "qg12 .req q25\n"
+      " g21 .req v26\n"
+      "qg21 .req q26\n"
+      " g22 .req v27\n"
+      "qg22 .req q27\n"
+
+      // Prepare the various strides
+      "col_stride .req %x[col_stride]\n"
+      "row_stride .req %x[row_stride]\n"
+      "row_plus_col_stride .req %x[row_plus_col_stride]\n"
+
+      "mstride1 .req %x[mstride1]\n"
+      "mstride2 .req %x[mstride2]\n"
+      "mstride3 .req %x[mstride3]\n"
+
+      "tile_i  .req x19\n"  // Tile row counter
+      "tile_j  .req x20\n"  // Tile column counter
+      "channel .req x21\n"  // Channel counter
+
+      "1:"  // Loop over batches
+        "mov tile_i, %x[tile_M]\n"  // Reset tile row counter
+
+        "2:"  // Loop over rows of tiles
+          "mov tile_j, %x[tile_N]\n"  // Reset tile column counter
+
+          "3:"  // Loop over columns of tiles
+            // Perform initial loads of the matrix `F`
+            "ldr qF11, [%x[inptr0]]\n"
+            "ldr qF12, [%x[inptr0], mstride1]\n"
+            "ldr qF13, [%x[inptr0], mstride2]\n"
+            "ldr qF14, [%x[inptr0], mstride3]\n"
+            "add %x[inptr0], %x[inptr0], #0x10\n"
+            "ldr qF21, [%x[inptr4]]\n"
+            "ldr qF22, [%x[inptr4], mstride1]\n"
+            "subs channel, %x[n_channels], #4\n"  // Reset channel counter
+
+            "ldr qF23, [%x[inptr4], mstride2]\n"
+            "ldr qF24, [%x[inptr4], mstride3]\n"
+            "add %x[inptr4], %x[inptr4], #0x10\n"
+            "beq 5f\n"  // Jump straight to tail if necessary
+
+            "4:"  // Loop over channels
+              "ldr qF31, [%x[inptr8]]\n"
+              "fadd FZ11.4s,  F11.4s, F12.4s\n"
+
+              "ldr qF32, [%x[inptr8], mstride1]\n"
+              "fsub FZ12.4s,  F12.4s, F13.4s\n"
+
+              "ldr qF33, [%x[inptr8], mstride2]\n"
+              "fadd FZ11.4s, FZ11.4s, F13.4s\n"
+
+              "ldr qF34, [%x[inptr8], mstride3]\n"
+              "fsub FZ12.4s, FZ12.4s, F14.4s\n"
+
+              "ldr qF41, [%x[inptr12]]\n"
+              "fadd FZ21.4s,  F21.4s, F22.4s\n"
+
+              "ldr qF42, [%x[inptr12], mstride1]\n"
+              "fsub FZ22.4s,  F22.4s, F23.4s\n"
+
+              "ldr qF43, [%x[inptr12], mstride2]\n"
+              "fadd FZ21.4s, FZ21.4s, F23.4s\n"
+
+              "ldr qF44, [%x[inptr12], mstride3]\n"
+              "fsub FZ22.4s, FZ22.4s, F24.4s\n"
+
+              "fadd FZ31.4s,  F31.4s, F32.4s\n"
+              "add %x[inptr8], %x[inptr8], #0x10\n"
+
+              "fsub FZ32.4s,  F32.4s, F33.4s\n"
+              "add %x[inptr12], %x[inptr12], #0x10\n"
+
+              "fadd FZ31.4s, FZ31.4s, F33.4s\n"
+
+              "fsub FZ32.4s, FZ32.4s, F34.4s\n"
+
+              "fadd g11.4s, FZ11.4s, FZ21.4s\n"
+
+              "fadd g12.4s, FZ12.4s, FZ22.4s\n"
+
+              "fadd g11.4s,  g11.4s, FZ31.4s\n"
+
+              "fadd g12.4s,  g12.4s, FZ32.4s\n"
+
+              "ldr qF11, [%x[inptr0]]\n"
+              "fadd FZ41.4s,  F41.4s, F42.4s\n"
+
+              "ldr qF12, [%x[inptr0], mstride1]\n"
+              "fsub g21.4s, FZ21.4s, FZ31.4s\n"
+
+              "ldr qF13, [%x[inptr0], mstride2]\n"
+              "fsub FZ42.4s,  F42.4s, F43.4s\n"
+
+              "ldr qF14, [%x[inptr0], mstride3]\n"
+              "str qg11, [%x[outptr]]\n"
+
+              "ldr qF21, [%x[inptr4]]\n"
+              "fadd FZ41.4s, FZ41.4s, F43.4s\n"
+
+              "ldr qF22, [%x[inptr4], mstride1]\n"
+              "str qg12, [%x[outptr], col_stride]\n"
+
+              "ldr qF23, [%x[inptr4], mstride2]\n"
+              "fsub FZ42.4s, FZ42.4s, F44.4s\n"
+
+              "ldr qF24, [%x[inptr4], mstride3]\n"
+              "fsub g22.4s, FZ22.4s, FZ32.4s\n"
+
+              "fsub g21.4s,  g21.4s, FZ41.4s\n"
+              "add %x[inptr0], %x[inptr0], #0x10\n"
+
+              "fsub g22.4s,  g22.4s, FZ42.4s\n"
+              "add %x[inptr4], %x[inptr4], #0x10\n"
+
+              "subs channel, channel, #4\n"
+
+              "str qg21, [%x[outptr], row_stride]\n"
+
+              "str qg22, [%x[outptr], row_plus_col_stride]\n"
+
+              "add %x[outptr], %x[outptr], #0x10\n"
+
+              "bne 4b\n"
+
+            "5:"  // Channel tail
+              "ldr qF31, [%x[inptr8]]\n"
+              "fadd FZ11.4s,  F11.4s, F12.4s\n"
+
+              "ldr qF32, [%x[inptr8], mstride1]\n"
+              "fsub FZ12.4s,  F12.4s, F13.4s\n"
+
+              "ldr qF33, [%x[inptr8], mstride2]\n"
+              "fadd FZ11.4s, FZ11.4s, F13.4s\n"
+
+              "ldr qF34, [%x[inptr8], mstride3]\n"
+              "fsub FZ12.4s, FZ12.4s, F14.4s\n"
+
+              "ldr qF41, [%x[inptr12]]\n"
+              "fadd FZ21.4s,  F21.4s, F22.4s\n"
+
+              "ldr qF42, [%x[inptr12], mstride1]\n"
+              "fsub FZ22.4s,  F22.4s, F23.4s\n"
+
+              "ldr qF43, [%x[inptr12], mstride2]\n"
+              "fadd FZ21.4s, FZ21.4s, F23.4s\n"
+
+              "ldr qF44, [%x[inptr12], mstride3]\n"
+              "fsub FZ22.4s, FZ22.4s, F24.4s\n"
+
+              "fadd FZ31.4s,  F31.4s, F32.4s\n"
+              "add %x[inptr8], %x[inptr8], #0x10\n"
+
+              "fsub FZ32.4s,  F32.4s, F33.4s\n"
+              "add %x[inptr12], %x[inptr12], #0x10\n"
+
+              "fadd FZ31.4s, FZ31.4s, F33.4s\n"
+
+              "fsub FZ32.4s, FZ32.4s, F34.4s\n"
+
+              "fadd g11.4s, FZ11.4s, FZ21.4s\n"
+
+              "fadd g12.4s, FZ12.4s, FZ22.4s\n"
+
+              "fadd g11.4s,  g11.4s, FZ31.4s\n"
+
+              "fadd g12.4s,  g12.4s, FZ32.4s\n"
+
+              "fadd FZ41.4s,  F41.4s, F42.4s\n"
+
+              "fsub g21.4s, FZ21.4s, FZ31.4s\n"
+
+              "fsub FZ42.4s,  F42.4s, F43.4s\n"
+
+              "str qg11, [%x[outptr]]\n"
+
+              "fadd FZ41.4s, FZ41.4s, F43.4s\n"
+
+              "str qg12, [%x[outptr], col_stride]\n"
+
+              "fsub FZ42.4s, FZ42.4s, F44.4s\n"
+
+              "fsub g22.4s, FZ22.4s, FZ32.4s\n"
+
+              "fsub g21.4s,  g21.4s, FZ41.4s\n"
+
+              "fsub g22.4s,  g22.4s, FZ42.4s\n"
+
+              "subs channel, channel, #4\n"
+
+              "str qg21, [%x[outptr], row_stride]\n"
+
+              // Progress input pointers to the next row of the matrix
+              "add  %x[inptr0],  %x[inptr0], %x[mrowpad]\n"
+              "add  %x[inptr4],  %x[inptr4], %x[mrowpad]\n"
+              "add  %x[inptr8],  %x[inptr8], %x[mrowpad]\n"
+              "add %x[inptr12], %x[inptr12], %x[mrowpad]\n"
+
+              "str qg22, [%x[outptr], row_plus_col_stride]\n"
+
+              "add %x[outptr], %x[outptr], #0x10\n"
+
+
+            "add %x[outptr], %x[outptr], col_stride\n"
+            "subs tile_j, tile_j, #1\n"
+            "bne 3b\n"
+
+          "add %x[outptr], %x[outptr], row_stride\n"
+          "subs tile_i, tile_i, #1\n"
+          "bne 2b\n"
+
+        "subs %w[batch], %w[batch], #1\n"
+        "bne 1b\n"
+
+      ".unreq  F11\n" ".unreq qF11\n"
+      ".unreq  F12\n" ".unreq qF12\n"
+      ".unreq  F13\n" ".unreq qF13\n"
+      ".unreq  F14\n" ".unreq qF14\n"
+      ".unreq  F21\n" ".unreq qF21\n"
+      ".unreq  F22\n" ".unreq qF22\n"
+      ".unreq  F23\n" ".unreq qF23\n"
+      ".unreq  F24\n" ".unreq qF24\n"
+      ".unreq  F31\n" ".unreq qF31\n"
+      ".unreq  F32\n" ".unreq qF32\n"
+      ".unreq  F33\n" ".unreq qF33\n"
+      ".unreq  F34\n" ".unreq qF34\n"
+      ".unreq  F41\n" ".unreq qF41\n"
+      ".unreq  F42\n" ".unreq qF42\n"
+      ".unreq  F43\n" ".unreq qF43\n"
+      ".unreq  F44\n" ".unreq qF44\n"
+
+      ".unreq FZ11\n" ".unreq FZ12\n"
+      ".unreq FZ21\n" ".unreq FZ22\n"
+      ".unreq FZ31\n" ".unreq FZ32\n"
+      ".unreq FZ41\n" ".unreq FZ42\n"
+
+      ".unreq  g11\n" ".unreq qg11\n"
+      ".unreq  g12\n" ".unreq qg12\n"
+      ".unreq  g21\n" ".unreq qg21\n"
+      ".unreq  g22\n" ".unreq qg22\n"
+
+      ".unreq col_stride\n"
+      ".unreq row_stride\n"
+      ".unreq row_plus_col_stride\n"
+
+      ".unreq mstride1\n"
+      ".unreq mstride2\n"
+      ".unreq mstride3\n"
+
+      ".unreq tile_i \n"
+      ".unreq tile_j \n"
+      ".unreq channel\n"
+
+    : [batch] "+r" (batch),
+      [outptr] "+r" (outptr),
+      [inptr0] "+r" (inptr0),
+      [inptr4] "+r" (inptr4),
+      [inptr8] "+r" (inptr8),
+      [inptr12] "+r" (inptr12)
+    : [tile_M] "r" (tile_M),
+      [tile_N] "r" (tile_N),
+      [n_channels] "r" (output_shape.n_channels),
+      [col_stride] "r" (col_stride),
+      [row_stride] "r" (row_stride),
+      [row_plus_col_stride] "r" (row_stride + col_stride),
+      [mstride1] "r" (mstride * sizeof(float)),
+      [mstride2] "r" (2 * mstride * sizeof(float)),
+      [mstride3] "r" (3 * mstride * sizeof(float)),
+      [mrowpad] "r" ((matrix_row_stride - output_shape.n_channels) * sizeof(float))
+    : "x19", "x20", "x21",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+      "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
+      "v22", "v23", "v24", "v25", "v26", "v27",
+      "cc", "memory"
+  );
+}
+
+template <>
+template <bool tail_M, bool tail_N, const int channel_tail>
+inline void Winograd2x2_3x3GemmOutput<float>::_execute(
+    const Tensor4DShape &output_shape,
+    float *output,
+    const float *input,
+    const int mstride,
+    const int matrix_row_stride
+) {
+  // Compute basic information about the shape of the matrices
+  const int tile_M = output_shape.n_rows / 2;
+  const int tile_N = output_shape.n_cols / 2;
+  const int n_channels = output_shape.n_channels;
+
+  // Extract 16 input pointers
+  const float* inptr[16];
+  for (int i = 0; i < 16; i++) {
+    inptr[i] = input + i*mstride;
+  }
+
+  // Extract 4 output pointers
+  float *outptr00 = output;
+  float *outptr01 = outptr00 + n_channels;
+  float *outptr10 = outptr00 + output_shape.n_cols * n_channels;
+  float *outptr11 = outptr10 + n_channels;
+
+  // Progress over the output tiles, generating output values.
+  for (int batch = 0; batch < output_shape.n_batches; batch++) {
+    for (int tile_i = 0; tile_i < tile_M; tile_i++) {
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          float F[4][4];
+          for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 4; j++) {
+              F[i][j] = *(inptr[i*4 + j]++);
+            }
+          }
+
+          // Compute the matrix F.Z
+          float ZF[4][2];
+          ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
+          ZF[0][1] = F[0][1] - F[0][2] - F[0][3];
+          ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
+          ZF[1][1] = F[1][1] - F[1][2] - F[1][3];
+          ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
+          ZF[2][1] = F[2][1] - F[2][2] - F[2][3];
+          ZF[3][0] = F[3][0] + F[3][1] + F[3][2];
+          ZF[3][1] = F[3][1] - F[3][2] - F[3][3];
+
+          // Hence compute the output matrix Z^T . (F.Z)
+          *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
+          *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1];
+          *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0];
+          *(outptr11++) = ZF[1][1] - ZF[2][1] - ZF[3][1];
+        }
+
+        // Progress the input pointers to the next row
+        for (int i = 0; i < 16; i++) {
+          inptr[i] += matrix_row_stride - n_channels;
+        }
+
+        // Progress the output pointers to the next column
+        outptr00 += n_channels;
+        outptr01 += n_channels;
+        outptr10 += n_channels;
+        outptr11 += n_channels;
+      }
+
+      if (tail_N) {
+        // Only evaluate the left column of the final (partial) output tile
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          float F[4][3];
+          for (int i = 0; i < 4; i++) {
+            for (int j = 0; j < 3; j++) {
+              F[i][j] = *(inptr[i*4 + j]++);
+            }
+          }
+          for (int i = 0; i < 4; i++) {
+            inptr[i*4 + 3]++;
+          }
+
+          // Compute the matrix F.Z
+          float ZF[4][1];
+          ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
+          ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
+          ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
+          ZF[3][0] = F[3][0] + F[3][1] + F[3][2];
+
+          // Hence compute the output matrix Z^T . (F.Z)
+          *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
+          *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0];
+        }
+
+        // Progress the input pointers to the next row
+        for (int i = 0; i < 16; i++) {
+          inptr[i] += matrix_row_stride - n_channels;
+        }
+
+        // Progress the output pointers to the next column
+        outptr01 += n_channels;  // Account for being skipped above
+        outptr11 += n_channels;  // Account for being skipped above
+      }
+
+      // Progress the output pointers to the next row
+      outptr00 += output_shape.n_cols * n_channels;
+      outptr01 += output_shape.n_cols * n_channels;
+      outptr10 += output_shape.n_cols * n_channels;
+      outptr11 += output_shape.n_cols * n_channels;
+    }
+
+    if (tail_M) {
+      // Only work on the upper row of the output
+      for (int tile_j = 0; tile_j < tile_N; tile_j++) {
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          float F[3][4];
+          for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 4; j++) {
+              F[i][j] = *(inptr[i*4 + j]++);
+            }
+          }
+          for (int j = 0; j < 4; j++) {
+            inptr[12 + j]++;
+          }
+
+          // Compute the matrix F.Z
+          float ZF[3][2];
+          ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
+          ZF[0][1] = F[0][1] - F[0][2] - F[0][3];
+          ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
+          ZF[1][1] = F[1][1] - F[1][2] - F[1][3];
+          ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
+          ZF[2][1] = F[2][1] - F[2][2] - F[2][3];
+
+          // Hence compute the output matrix Z^T . (F.Z)
+          *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
+          *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1];
+        }
+
+        // Progress the input pointers to the next row
+        for (int i = 0; i < 16; i++) {
+          inptr[i] += matrix_row_stride - n_channels;
+        }
+
+        // Progress the output pointers to the next column
+        outptr00 += n_channels;
+        outptr01 += n_channels;
+        outptr10 += 2 * n_channels;  // Account for being skipped above
+        outptr11 += 2 * n_channels;  // Account for being skipped above
+      }
+
+      if (tail_N) {
+        // Only evaluate the upper-left cell of the output
+        for (int channel = 0; channel < n_channels; channel++) {
+          // Read values from the input pointers
+          float F[3][3];
+          for (int i = 0; i < 3; i++) {
+            for (int j = 0; j < 3; j++) {
+              F[i][j] = *(inptr[i*4 + j]);
+            }
+          }
+          for (int i = 0; i < 16; i++) {
+            inptr[i]++;
+          }
+
+          // Compute the matrix F.Z
+          float ZF[3][1];
+          ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
+          ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
+          ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
+
+          // Hence compute the output matrix Z^T . (F.Z)
+          *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
+        }
+
+        // Progress the input pointers to the next row
+        for (int i = 0; i < 16; i++) {
+          inptr[i] += matrix_row_stride - n_channels;
+        }
+
+        // Progress the output pointers to the next column
+        outptr01 += n_channels;  // Account for being skipped above
+        outptr10 += n_channels;  // Account for being skipped above
+        outptr11 += n_channels;  // Account for being skipped above
+      }
+    }
+  }
+}
+
+/*****************************************************************************/
+template <>
+inline void Winograd2x2_3x3GemmOutput<float>::execute(
+    const Tensor4DShape &output_shape,
+    float* const matrix_base,
+    const int matrix_stride,
+    const int matrix_row_stride,
+    float* const output
+) {
+  // Dispatch to an appropriate implementation based on the shape of the output
+  // tensor.
+  if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
+    constexpr bool tail_M = true, tail_N = true;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  } else if (output_shape.n_rows % 2) {
+    constexpr bool tail_M = true, tail_N = false;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+    }
+  } else if (output_shape.n_cols % 2) {
+    constexpr bool tail_M = false, tail_N = true;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+
+    }
+  } else {
+    constexpr bool tail_M = false, tail_N = false;
+    switch (output_shape.n_channels % 4) {
+      case 0:
+        _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 1:
+        _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 2:
+        _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      case 3:
+        _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
+        break;
+      default:
+        assert(0);
+        break;
+
+    }
+  }
+}
+/*****************************************************************************/
+
+}  // namespace winograd
+#endif  // __aarch64__
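
For reference, the hand-unrolled sums in the scalar _execute fallback above are the
Winograd F(2x2, 3x3) output transform Y = Z^T . F . Z applied per channel. A minimal
matrix-form sketch, with the 2x4 matrix Z^T inferred from those expressions (it is not
spelled out in the kernel itself), is:

static void winograd_output_tile_2x2(const float F[4][4], float Y[2][2]) {
  // Z^T as implied by the unrolled code: row 0 sums columns 0..2,
  // row 1 computes column1 - column2 - column3.
  const float Zt[2][4] = {{1.f, 1.f, 1.f, 0.f}, {0.f, 1.f, -1.f, -1.f}};
  float ZF[4][2] = {};                       // ZF = F . Z
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 2; j++)
      for (int k = 0; k < 4; k++)
        ZF[i][j] += F[i][k] * Zt[j][k];
  for (int i = 0; i < 2; i++)                // Y = Z^T . (F . Z)
    for (int j = 0; j < 2; j++) {
      Y[i][j] = 0.f;
      for (int k = 0; k < 4; k++)
        Y[i][j] += Zt[i][k] * ZF[k][j];
    }
}
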
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
new file mode 100644
index 0000000..f551b12
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __aarch64__
+
+/*****************************************************************************/
+// Compute ZF specializations
+
+template <>
+template <>
+inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::compute_zf<0>(
+    const int n_rows, const int n_channels,
+    float* output, const float* const input[16]
+) {
+  // Make copies of some variables
+  int row = n_rows;
+  float* outptr = output;
+  const float* inptr = input[0];
+
+  // Perform the transformation
+  asm volatile (
+    // "inptr0 .req %x[inptr]\n"
+    "inptr1 .req x0\n"
+    "inptr2 .req x1\n"
+    "inptr3 .req x2\n"
+    "inptr4 .req x3\n"
+    "inptr5 .req x4\n"
+    "inptr6 .req x5\n"
+    "inptr7 .req x6\n"
+    "inptr8 .req x7\n"
+    "inptr9 .req x8\n"
+    "inptr10 .req x9\n"
+    "inptr11 .req x10\n"
+    "inptr12 .req x11\n"
+    "inptr13 .req x12\n"
+    "inptr14 .req x13\n"
+    "inptr15 .req x14\n"
+
+    // "outptr0 .req %x[outptr]\n"
+    "outptr1 .req x15\n"
+    "outptr2 .req x16\n"
+    "outptr3 .req x17\n"
+    "outptr4 .req x18\n"
+    "outptr5 .req x19\n"
+    "outptr6 .req x20\n"
+    "outptr7 .req x21\n"
+
+    // Compute additional pointers into the input and output matrices.
+    "mstride .req x22\n"  // Matrix stride
+    "mul mstride, %x[row], %x[n_channels]\n"
+    "lsl mstride, mstride, #2\n"  // * sizeof(float)
+
+    "add inptr1, %x[inptr], mstride\n"
+    "add inptr2, %x[inptr], mstride, LSL #1\n"
+    "add inptr3, inptr2, mstride\n"
+    "add inptr4, inptr3, mstride\n"
+    "add inptr5, inptr4, mstride\n"
+    "add inptr6, inptr5, mstride\n"
+    "add inptr7, inptr6, mstride\n"
+    "add inptr8, inptr7, mstride\n"
+    "add inptr9, inptr8, mstride\n"
+    "add inptr10, inptr9, mstride\n"
+    "add inptr11, inptr10, mstride\n"
+    "add inptr12, inptr11, mstride\n"
+    "add inptr13, inptr12, mstride\n"
+    "add inptr14, inptr13, mstride\n"
+    "add inptr15, inptr14, mstride\n"
+
+    "add outptr1, %[outptr], mstride\n"
+    "add outptr2, outptr1, mstride\n"
+    "add outptr3, outptr2, mstride\n"
+    "add outptr4, outptr3, mstride\n"
+    "add outptr5, outptr4, mstride\n"
+    "add outptr6, outptr5, mstride\n"
+    "add outptr7, outptr6, mstride\n"
+
+    ".unreq mstride\n"
+
+    "column .req x22\n"  // Column loop counter
+
+    "1:"  // Loop over rows
+      "ldr q0, [%x[inptr]], #0x10\n"
+      "ldr q1, [inptr1], #0x10\n"
+      "ldr q2, [inptr2], #0x10\n"
+      "ldr q3, [inptr3], #0x10\n"
+      "ldr q4, [inptr4], #0x10\n"
+      "ldr q5, [inptr5], #0x10\n"
+      "ldr q6, [inptr6], #0x10\n"
+      "ldr q7, [inptr7], #0x10\n"
+      "subs column, %x[n_channels], #0x4\n"
+      "beq 3f\n"
+
+      "2:"  // Loop over columns
+        "ldr q8, [inptr8], #0x10\n"
+        "prfm pldl1keep, [%x[inptr], #196]\n"
+        "fadd v16.4s, v0.4s, v1.4s\n"
+
+        "ldr q9, [inptr9], #0x10\n"
+        "prfm pldl1keep, [inptr1, #196]\n"
+        "fsub v17.4s, v1.4s, v2.4s\n"
+
+        "ldr q10, [inptr10], #0x10\n"
+        "prfm pldl1keep, [inptr2, #196]\n"
+        "fadd v16.4s, v16.4s, v2.4s\n"
+
+        "ldr q11, [inptr11], #0x10\n"
+        "prfm pldl1keep, [inptr3, #196]\n"
+        "fsub v17.4s, v17.4s, v3.4s\n"
+
+        "ldr q12, [inptr12], #0x10\n"
+        "prfm pldl1keep, [inptr4, #196]\n"
+        "str q16, [%x[outptr]], #0x10\n"
+
+        "ldr q13, [inptr13], #0x10\n"
+        "prfm pldl1keep, [inptr5, #196]\n"
+        "str q17, [outptr1], #0x10\n"
+
+        "ldr q14, [inptr14], #0x10\n"
+        "prfm pldl1keep, [inptr6, #196]\n"
+        "fadd v16.4s, v4.4s, v5.4s\n"
+
+        "ldr q15, [inptr15], #0x10\n"
+        "prfm pldl1keep, [inptr7, #196]\n"
+        "fsub v17.4s, v5.4s, v6.4s\n"
+
+        "ldr q0, [%x[inptr]], #0x10\n"
+        "prfm pldl1keep, [inptr8, #196]\n"
+        "fadd v16.4s, v16.4s, v6.4s\n"
+
+        "ldr q1, [inptr1], #0x10\n"
+        "prfm pldl1keep, [inptr9, #196]\n"
+        "fsub v17.4s, v17.4s, v7.4s\n"
+
+        "ldr q2, [inptr2], #0x10\n"
+        "prfm pldl1keep, [inptr10, #196]\n"
+        "str q16, [outptr2], #0x10\n"
+
+        "ldr q3, [inptr3], #0x10\n"
+        "prfm pldl1keep, [inptr11, #196]\n"
+        "str q17, [outptr3], #0x10\n"
+
+        "ldr q4, [inptr4], #0x10\n"
+        "prfm pldl1keep, [inptr12, #196]\n"
+        "fadd v16.4s, v8.4s, v9.4s\n"
+
+        "ldr q5, [inptr5], #0x10\n"
+        "prfm pldl1keep, [inptr13, #196]\n"
+        "fsub v17.4s, v9.4s, v10.4s\n"
+
+        "ldr q6, [inptr6], #0x10\n"
+        "prfm pldl1keep, [inptr14, #196]\n"
+        "fadd v16.4s, v16.4s, v10.4s\n"
+
+        "ldr q7, [inptr7], #0x10\n"
+        "prfm pldl1keep, [inptr15, #196]\n"
+        "fsub v17.4s, v17.4s, v11.4s\n"
+
+        "str q16, [outptr4], #0x10\n"
+        "fadd v16.4s, v12.4s, v13.4s\n"
+        "fsub v18.4s, v13.4s, v14.4s\n"
+
+        "str q17, [outptr5], #0x10\n"
+        "fadd v16.4s, v16.4s, v14.4s\n"
+        "fsub v18.4s, v18.4s, v15.4s\n"
+
+        "str q16, [outptr6], #0x10\n"
+        "subs column, column, #0x4\n"
+
+        "str q18, [outptr7], #0x10\n"
+        "bne 2b\n"
+
+      "3:"  // Tail
+        "ldr q8, [inptr8], #0x10\n"
+        "prfm pldl1keep, [%x[inptr], #196]\n"
+        "fadd v16.4s, v0.4s, v1.4s\n"
+
+        "ldr q9, [inptr9], #0x10\n"
+        "prfm pldl1keep, [inptr1, #196]\n"
+        "fsub v17.4s, v1.4s, v2.4s\n"
+
+        "ldr q10, [inptr10], #0x10\n"
+        "prfm pldl1keep, [inptr2, #196]\n"
+        "fadd v16.4s, v16.4s, v2.4s\n"
+
+        "ldr q11, [inptr11], #0x10\n"
+        "prfm pldl1keep, [inptr3, #196]\n"
+        "fsub v17.4s, v17.4s, v3.4s\n"
+
+        "ldr q12, [inptr12], #0x10\n"
+        "prfm pldl1keep, [inptr4, #196]\n"
+        "str q16, [%x[outptr]], #0x10\n"
+
+        "ldr q13, [inptr13], #0x10\n"
+        "prfm pldl1keep, [inptr5, #196]\n"
+        "str q17, [outptr1], #0x10\n"
+
+        "ldr q14, [inptr14], #0x10\n"
+        "prfm pldl1keep, [inptr6, #196]\n"
+        "fadd v16.4s, v4.4s, v5.4s\n"
+
+        "ldr q15, [inptr15], #0x10\n"
+        "prfm pldl1keep, [inptr7, #196]\n"
+        "fsub v17.4s, v5.4s, v6.4s\n"
+
+        "prfm pldl1keep, [inptr8, #196]\n"
+        "prfm pldl1keep, [inptr9, #196]\n"
+        "fadd v16.4s, v16.4s, v6.4s\n"
+
+        "prfm pldl1keep, [inptr10, #196]\n"
+        "prfm pldl1keep, [inptr11, #196]\n"
+        "fsub v17.4s, v17.4s, v7.4s\n"
+
+        "prfm pldl1keep, [inptr12, #196]\n"
+        "prfm pldl1keep, [inptr13, #196]\n"
+        "str q16, [outptr2], #0x10\n"
+
+        "prfm pldl1keep, [inptr14, #196]\n"
+        "prfm pldl1keep, [inptr15, #196]\n"
+        "str q17, [outptr3], #0x10\n"
+
+        "fadd v16.4s, v8.4s, v9.4s\n"
+        "fsub v17.4s, v9.4s, v10.4s\n"
+
+        "fadd v16.4s, v16.4s, v10.4s\n"
+        "fsub v17.4s, v17.4s, v11.4s\n"
+
+        "str q16, [outptr4], #0x10\n"
+        "fadd v16.4s, v12.4s, v13.4s\n"
+        "fsub v18.4s, v13.4s, v14.4s\n"
+
+        "str q17, [outptr5], #0x10\n"
+        "fadd v16.4s, v16.4s, v14.4s\n"
+        "fsub v18.4s, v18.4s, v15.4s\n"
+
+        "str q16, [outptr6], #0x10\n"
+        "str q18, [outptr7], #0x10\n"
+
+      "subs %x[row], %x[row], #0x1\n"
+      "bne 1b\n"
+
+    ".unreq inptr1\n"
+    ".unreq inptr2\n"
+    ".unreq inptr3\n"
+    ".unreq inptr4\n"
+    ".unreq inptr5\n"
+    ".unreq inptr6\n"
+    ".unreq inptr7\n"
+    ".unreq inptr8\n"
+    ".unreq inptr9\n"
+    ".unreq inptr10\n"
+    ".unreq inptr11\n"
+    ".unreq inptr12\n"
+    ".unreq inptr13\n"
+    ".unreq inptr14\n"
+    ".unreq inptr15\n"
+    ".unreq outptr1\n"
+    ".unreq outptr2\n"
+    ".unreq outptr3\n"
+    ".unreq outptr4\n"
+    ".unreq outptr5\n"
+    ".unreq outptr6\n"
+    ".unreq outptr7\n"
+
+    : [row] "+r" (row),
+      [inptr] "+r" (inptr),
+      [outptr] "+r" (outptr)
+    : [n_channels] "r" (n_channels),
+      [sizeof_float] "i" (sizeof(float))
+    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+      "q12", "q13", "q14", "q15", "q16", "q17", "x0", "x1", "x2", "x3", "x4",
+      "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+      "x16", "x17", "x18", "x19", "x20", "x21", "x22", "cc", "memory"
+  );
+}
+
+/*****************************************************************************/
+// Compute ZFZ^T specializations
+
+template <>
+template <>
+inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::compute_zfzT<false, false, 0>(
+    const Tensor4DShape &output_shape,
+    float* const output, const float* const input
+) {
+  const int tile_M = output_shape.n_rows / 2;
+  const int tile_N = output_shape.n_cols / 2;
+  int batch = output_shape.n_batches;
+  float *outptr = output;
+  const float *inptr = input;
+
+  asm volatile (
+    // Compute input pointers
+    "inptr1 .req x0\n"
+    "inptr2 .req x1\n"
+    "inptr3 .req x2\n"
+    "inptr4 .req x3\n"
+    "inptr5 .req x4\n"
+    "inptr6 .req x5\n"
+    "inptr7 .req x6\n"
+    "inptr8 .req x7\n"
+
+    "mstride .req x8\n"
+    "mul mstride, %x[tile_M], %x[tile_N]\n"
+    "mul mstride, mstride, %x[n_channels]\n"
+    "lsl mstride, mstride, #2\n"  // * sizeof(float)
+
+    "add inptr1, %[inptr], mstride\n"
+    "add inptr2, inptr1, mstride\n"
+    "add inptr3, inptr2, mstride\n"
+    "add inptr4, inptr3, mstride\n"
+    "add inptr5, inptr4, mstride\n"
+    "add inptr6, inptr5, mstride\n"
+    "add inptr7, inptr6, mstride\n"
+    "add inptr8, inptr7, mstride\n"
+
+    ".unreq mstride\n"
+
+    // Compute initial output pointers
+    "outptr01 .req  x8\n"
+    "outptr10 .req  x9\n"
+    "outptr11 .req x10\n"
+
+    "add outptr01, %x[outptr], %x[n_channels], LSL #2\n"
+    "add outptr10, %x[outptr], %x[row_stride], LSL #2\n"
+    "add outptr11,   outptr10, %x[n_channels], LSL #2\n"
+
+    "tile_i  .req x11\n"
+    "tile_j  .req x12\n"
+    "channel .req x13\n"
+
+    "1:"  // Loop over batches
+      "mov tile_i, %x[tile_M]\n"
+
+      "2:"  // Loop over rows of output tiles
+        "mov tile_j, %x[tile_N]\n"
+
+        "3:"  // Loop over columns of output tiles
+          "ldr q0, [%x[inptr]], #0x10\n"
+          "ldr q2, [inptr2], #0x10\n"
+          "subs channel, %x[n_channels], #0x4\n"
+
+          "ldr q1, [inptr1], #0x10\n"
+          "ldr q3, [inptr3], #0x10\n"
+          "beq 6f\n"
+
+          "4:"
+            "ldr q4, [inptr4], #0x10\n"
+            "ldr q5, [inptr5], #0x10\n"
+            "fadd v16.4s, v0.4s, v2.4s\n"
+
+            "ldr q6, [inptr6], #0x10\n"
+            "ldr q7, [inptr7], #0x10\n"
+            "fadd v17.4s, v1.4s, v3.4s\n"
+
+            "ldr q8, [%x[inptr]], #0x10\n"
+            "ldr q10, [inptr2], #0x10\n"
+            "fadd v16.4s, v16.4s, v4.4s\n"
+
+            "ldr q9, [inptr1], #0x10\n"
+            "ldr q11, [inptr3], #0x10\n"
+            "fadd v17.4s, v17.4s, v5.4s\n"
+
+            "str q16, [%x[outptr]], #0x10\n"
+            "prfm pldl1strm, [%x[inptr], #196]\n"
+            "fsub v18.4s, v2.4s, v4.4s\n"
+
+            "str q17, [outptr01], #0x10\n"
+            "prfm pldl1strm, [inptr2, #196]\n"
+            "fsub v19.4s, v3.4s, v5.4s\n"
+
+            "prfm pldl1strm, [inptr1, #196]\n"
+            "prfm pldl1strm, [inptr3, #196]\n"
+            "fsub v18.4s, v18.4s, v6.4s\n"
+
+            "prfm pldl1strm, [inptr4, #196]\n"
+            "prfm pldl1strm, [inptr5, #196]\n"
+            "fsub v19.4s, v19.4s, v7.4s\n"
+
+            "str q18, [outptr10], #0x10\n"
+            "prfm pldl1strm, [inptr6, #196]\n"
+            "prfm pldl1strm, [inptr7, #196]\n"
+
+            "subs channel, channel, #0x4\n"
+
+            "str q19, [outptr11], #0x10\n"
+            "beq 6f\n"  // Branch to tail
+
+            "ldr q12, [inptr4], #0x10\n"
+            "ldr q13, [inptr5], #0x10\n"
+            "fadd v16.4s, v8.4s, v10.4s\n"
+
+            "ldr q14, [inptr6], #0x10\n"
+            "ldr q15, [inptr7], #0x10\n"
+            "fadd v17.4s, v9.4s, v11.4s\n"
+
+            "ldr q0, [%x[inptr]], #0x10\n"
+            "ldr q2, [inptr2], #0x10\n"
+            "fadd v16.4s, v16.4s, v12.4s\n"
+
+            "ldr q1, [inptr1], #0x10\n"
+            "ldr q3, [inptr3], #0x10\n"
+            "fadd v17.4s, v17.4s, v13.4s\n"
+
+            "str q16, [%x[outptr]], #0x10\n"
+            "prfm pldl1strm, [%x[inptr], #196]\n"
+            "fsub v18.4s, v10.4s, v12.4s\n"
+
+            "str q17, [outptr01], #0x10\n"
+            "prfm pldl1strm, [inptr2, #196]\n"
+            "fsub v19.4s, v11.4s, v13.4s\n"
+
+            "prfm pldl1strm, [inptr1, #196]\n"
+            "prfm pldl1strm, [inptr3, #196]\n"
+            "fsub v18.4s, v18.4s, v14.4s\n"
+
+            "prfm pldl1strm, [inptr4, #196]\n"
+            "prfm pldl1strm, [inptr5, #196]\n"
+            "fsub v19.4s, v19.4s, v15.4s\n"
+
+            "str q18, [outptr10], #0x10\n"
+            "prfm pldl1strm, [inptr6, #196]\n"
+            "prfm pldl1strm, [inptr7, #196]\n"
+
+            "subs channel, channel, #0x4\n"
+
+            "str q19, [outptr11], #0x10\n"
+            "bne 4b\n"  // Continue loop
+
+          "5:"  // Tail
+            "ldr q12, [inptr4], #0x10\n"
+            "ldr q13, [inptr5], #0x10\n"
+            "fadd v16.4s, v8.4s, v10.4s\n"
+
+            "ldr q14, [inptr6], #0x10\n"
+            "ldr q15, [inptr7], #0x10\n"
+            "fadd v17.4s, v9.4s, v11.4s\n"
+
+            "fadd v16.4s, v16.4s, v12.4s\n"
+
+            "fadd v17.4s, v17.4s, v13.4s\n"
+
+            "str q16, [%x[outptr]], #0x10\n"
+            "fsub v18.4s, v10.4s, v12.4s\n"
+            "fsub v19.4s, v11.4s, v13.4s\n"
+
+            "str q17, [outptr01], #0x10\n"
+            "fsub v18.4s, v18.4s, v14.4s\n"
+            "fsub v19.4s, v19.4s, v15.4s\n"
+
+            "str q18, [outptr10], #0x10\n"
+            "str q19, [outptr11], #0x10\n"
+            "b 7f\n"
+
+          "6:"  // Tail
+            "ldr q4, [inptr4], #0x10\n"
+            "ldr q5, [inptr5], #0x10\n"
+            "fadd v16.4s, v0.4s, v2.4s\n"
+
+            "ldr q6, [inptr6], #0x10\n"
+            "ldr q7, [inptr7], #0x10\n"
+            "fadd v17.4s, v1.4s, v3.4s\n"
+
+            "fadd v16.4s, v16.4s, v4.4s\n"
+
+            "fadd v17.4s, v17.4s, v5.4s\n"
+
+            "str q16, [%x[outptr]], #0x10\n"
+            "fsub v18.4s, v2.4s, v4.4s\n"
+            "fsub v19.4s, v3.4s, v5.4s\n"
+
+            "str q17, [outptr01], #0x10\n"
+            "fsub v18.4s, v18.4s, v6.4s\n"
+            "fsub v19.4s, v19.4s, v7.4s\n"
+
+            "str q18, [outptr10], #0x10\n"
+            "str q19, [outptr11], #0x10\n"
+
+          "7:"
+            "add %x[outptr], %x[outptr], %x[n_channels], LSL #2\n"
+            "add outptr01, outptr01, %x[n_channels], LSL #2\n"
+            "add outptr10, outptr10, %x[n_channels], LSL #2\n"
+            "add outptr11, outptr11, %x[n_channels], LSL #2\n"
+
+            "subs tile_j, tile_j, #1\n"
+            "bne 3b\n"
+
+        // Progress the output pointers to the new row
+        "add %x[outptr], %x[outptr], %x[row_stride], LSL #2\n"
+        "add   outptr01,   outptr01, %x[row_stride], LSL #2\n"
+        "add   outptr10,   outptr10, %x[row_stride], LSL #2\n"
+        "add   outptr11,   outptr11, %x[row_stride], LSL #2\n"
+
+        "subs tile_i, tile_i, #1\n"
+        "bne 2b\n"
+
+      "subs %[batch], %[batch], #1\n"
+      "bne 1b\n"
+      "5:"
+
+    ".unreq inptr1\n"
+    ".unreq inptr2\n"
+    ".unreq inptr3\n"
+    ".unreq inptr4\n"
+    ".unreq inptr5\n"
+    ".unreq inptr6\n"
+    ".unreq inptr7\n"
+    ".unreq inptr8\n"
+    ".unreq outptr01\n"
+    ".unreq outptr10\n"
+    ".unreq outptr11\n"
+    : [batch] "+r" (batch),
+      [outptr] "+r" (outptr),
+      [inptr] "+r" (inptr)
+    : [tile_M] "r" (tile_M),
+      [tile_N] "r" (tile_N),
+      [n_channels] "r" (output_shape.n_channels),
+      [row_stride] "r" (output_shape.n_cols * output_shape.n_channels)
+    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
+      "x12", "x13", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+      "cc", "memory"
+  );
+}
+/*****************************************************************************/
+
+/*****************************************************************************/
+template <>
+inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::execute(
+    const Tensor4DShape &output_shape,
+    float* const matrices[16], float* const output
+) {
+  // profiler prof;
+
+  // Allocate memory for the intermediate matrices
+  const int tile_M = iceildiv(output_shape.n_rows, 2);
+  const int tile_N = iceildiv(output_shape.n_cols, 2);
+  const int n_rows = output_shape.n_batches * tile_M * tile_N;
+  const int n_channels = output_shape.n_channels;
+  float* matrices_zf = reinterpret_cast<float*>(
+    calloc(8 * n_rows * n_channels, sizeof(float))
+  );
+  
+  // Perform the first stage transform, computing ZF.
+  const auto f_compute_zf = [&] () {
+    switch (n_channels % 4) {
+      case 0:
+        compute_zf<0>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 1:
+        compute_zf<1>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 2:
+        compute_zf<2>(n_rows, n_channels, matrices_zf, matrices);
+        break;
+      case 3:
+        compute_zf<3>(n_rows, n_channels, matrices_zf, matrices);
+    };
+  };
+  // prof("Compute ZF", f_compute_zf, 16 * n_rows * n_channels * sizeof(float), 0, 8 * n_rows * n_channels * sizeof(float));
+  f_compute_zf();
+  
+  // Perform the second stage transform, finishing Z F Z^T - variable dispatch
+  // based on size of the output and the channel tail.
+  const auto f_compute_zfzT = [&] () {
+    if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
+      constexpr bool tail_M = true, tail_N = true;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else if (output_shape.n_rows % 2) {
+      constexpr bool tail_M = true, tail_N = false;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else if (output_shape.n_cols % 2) {
+      constexpr bool tail_M = false, tail_N = true;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    } else {
+      constexpr bool tail_M = false, tail_N = false;
+      switch (n_channels % 4) {
+        case 0:
+          compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
+          break;
+        case 1:
+          compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
+          break;
+        case 2:
+          compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
+          break;
+        case 3:
+          compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
+      }
+    }
+  };
+  // prof("Compute ZFZT", f_compute_zfzT, 8 * n_rows * n_channels * sizeof(float), 0, 4 * n_rows * n_channels * sizeof(float));
+  f_compute_zfzT();
+
+  free(reinterpret_cast<void*>(matrices_zf));
+}
+/*****************************************************************************/
+
+#endif  // __aarch64__
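
For reference, the two-stage variant above factors the same transform into compute_zf
(collapse the 16 Winograd-domain matrices to 8 intermediates, applying Z on the right)
followed by compute_zfzT (collapse the 8 intermediates to the four quadrants of each
2x2 output tile, applying Z^T on the left). A scalar sketch for a single channel of a
single tile, with the ordering of the 8 intermediates inferred from the register usage
in the assembly, is:

static void winograd_output_two_stage(const float m[16], float y[4]) {
  float zf[8];
  // Stage 1 (compute_zf): zf[2*r + c] holds (F.Z)[r][c]
  for (int r = 0; r < 4; r++) {
    zf[2*r + 0] = m[4*r + 0] + m[4*r + 1] + m[4*r + 2];
    zf[2*r + 1] = m[4*r + 1] - m[4*r + 2] - m[4*r + 3];
  }
  // Stage 2 (compute_zfzT): y = Z^T . (F.Z), stored row-major as a 2x2 tile
  y[0] = zf[0] + zf[2] + zf[4];
  y[1] = zf[1] + zf[3] + zf[5];
  y[2] = zf[2] - zf[4] - zf[6];
  y[3] = zf[3] - zf[5] - zf[7];
}
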
diff --git a/src/core/Logger.cpp b/src/core/NEON/kernels/winograd/utils.hpp
similarity index 64%
copy from src/core/Logger.cpp
copy to src/core/NEON/kernels/winograd/utils.hpp
index 9c3bf26..14e709f 100644
--- a/src/core/Logger.cpp
+++ b/src/core/NEON/kernels/winograd/utils.hpp
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2017 ARM Limited.
  *
@@ -21,36 +22,34 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#pragma once
+#include <cstdio>
+#include <ctime>
 
-#include "arm_compute/core/Logger.h"
-
-using namespace arm_compute;
-
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
-{
+inline double TimeInUs(void) {
+#ifdef CYCLE_PROFILING
+  timespec t;
+  clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
+  return 1e6*t.tv_sec + 1e-3*t.tv_nsec;
+#else
+  return 0;
+#endif
 }
 
-Logger &Logger::get()
-{
-    static Logger _instance;
-    return _instance;
+inline int iceildiv(const int a, const int b) {
+  return (a + b - 1) / b;
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
-{
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+template <typename T>
+inline T roundup(const T a, const T b) {
+  return a + b - (a % b);
 }
 
-std::ostream &Logger::log_info()
-{
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
+inline void PrintMatrix(const float* const m, const int M, const int N, const int row_stride) {
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < N; j++) {
+      printf("%.3f ", m[i*row_stride + j]);
     }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
+    printf("\n");
+  }
+  printf("\n");
+}
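
A short usage sketch for the helpers above, as they are used by the Winograd GEMM code
to size tiles and matrix strides (the include path is an assumption). Note that
roundup(a, b) as written returns a + b when a is already a multiple of b:

#include <cassert>
#include "utils.hpp"  // the header above; path is an assumption

int main() {
  assert(iceildiv(7, 2) == 4);    // 7 output rows -> 4 row tiles of height 2
  assert(iceildiv(8, 2) == 4);
  assert(roundup(24, 16) == 32);  // pad 24 output channels up to the 16-wide N block
  assert(roundup(32, 16) == 48);  // already aligned: a full extra block is added
  return 0;
}
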
diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.hpp b/src/core/NEON/kernels/winograd/winograd_gemm.hpp
new file mode 100644
index 0000000..1ca3e31
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/winograd_gemm.hpp
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include <cstdint>
+#include <cstdlib>
+#include <cassert>
+#include <cstring>
+
+#include "gemm.hpp"
+#include "profiler.hpp"
+#include "utils.hpp"
+#include "shims.hpp"
+
+#include "transforms.hpp"
+
+namespace winograd {
+  /***************************************************************************/
+  /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM
+   * internally.
+   */
+  template <typename TOut, typename TIn>
+  class Winograd2x2_3x3GEMM {
+    public:
+      /* Instantiate a new Winograd operator.
+       */
+      Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);
+      virtual ~Winograd2x2_3x3GEMM();
+
+      /** Transform the weights into the Winograd domain.
+       */
+      template <typename KernelTransform=winograd2x2_3x3_gemm_kernel_transform_impl<TIn>>
+      void transform_weights(const TIn* const kernel, void *transform_working_space);
+
+      /* Initialise the matrix pointers; must be called once before execute().
+       */
+      template <typename InputTransform=Winograd2x2_3x3GemmInputChannelwise<TIn>>
+      void reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const TIn* const input, void* working_space);
+
+      /* Transform the GEMM results back out of the Winograd domain into the
+       * output tensor.
+       */
+      template <typename OutputTransform=Winograd2x2_3x3GemmOutput<TOut>>
+      void reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output);
+
+
+      /* Perform the GEMMs for the given range of the 16 Winograd-domain matrices.
+       */
+      void execute(size_t first, size_t last);
+
+      /* Get the memory required to transform the kernel.
+       */
+      static inline size_t get_kernel_transform_working_size(const KernelShape &shape);
+
+      /* Get the output shape of a convolution.
+       */
+      static Tensor4DShape get_output_shape(const Tensor4DShape &input_shape, const KernelShape &k_shape,
+                                     const PaddingType padding_type);
+
+      /* Get the memory required to instantiate a new Winograd operator.
+       */
+      static size_t get_kernel_storage_size(const KernelShape &shape);
+
+      /* Get the memory required to apply a Winograd operator to some input.
+       */
+      static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape,
+                                    const PaddingType padding);
+
+
+      Winograd2x2_3x3GEMM(const Winograd2x2_3x3GEMM &) = delete;
+      /** Prevent instances of this class from being copied (As this class contains pointers) */
+      Winograd2x2_3x3GEMM &operator=(const Winograd2x2_3x3GEMM &) = delete;
+      /** Allow instances of this class to be moved */
+      Winograd2x2_3x3GEMM(Winograd2x2_3x3GEMM &&) = default;
+      /** Allow instances of this class to be moved */
+      Winograd2x2_3x3GEMM &operator=(Winograd2x2_3x3GEMM &&) = default;
+
+    protected:
+      /* Get the memory required by a single "input" matrix.
+       */
+      static size_t get_input_matrix_size(const Tensor4DShape &input_shape,const KernelShape &k_shape,
+                                   const PaddingType padding);
+
+      /* Get the memory required by a single "output" matrix.
+       */
+      static size_t get_output_matrix_size(const Tensor4DShape &input_shape, const KernelShape &k_shape,
+                                    const PaddingType padding);
+
+      /* Get the memory required by a single "kernel" matrix.
+       */
+      static size_t get_kernel_matrix_size(const KernelShape &shape);
+
+      const KernelShape kernel_shape;  // Shape of applied kernel
+      const Tensor4DShape in_shape;
+      const PaddingType padding;
+
+      const int kernel_matrix_row_stride;  // Stride within kernel matrix
+
+      const bool manage_kernel_storage;  // Free kernel storage when done
+      void* const _kernel_storage;  // Base pointer for kernel matrices
+
+      profiler prof;  // Profiler
+
+      TIn *kernel_matrices[16];  // Prepared form of kernel
+      TIn *input_matrices[16];
+      TOut *output_matrices[16];
+
+
+      static const int M_BLOCK = 4;
+      static const int N_BLOCK = 16;
+  };
+} // namespace winograd
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_transform_working_size(
+    const KernelShape &shape
+)
+{
+    // Need to re-order the kernel into HWIO form, so enough space is needed
+    // to hold a copy of the tensor.
+    return sizeof(TIn) * shape.size();
+}
+
+
+template <typename TOut, typename TIn>
+template <typename KernelTransform>
+void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::transform_weights(
+  const TIn* const kernel,
+  void *transform_working_space
+)
+{
+    const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
+    int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
+    for (int i = 0; i < 16; i++) {
+        kernel_matrices[i] = reinterpret_cast<TIn *>(
+        ks_bytes + i*kernel_matrix_size_bytes);
+    }
+
+    const TIn *kernel_hwio = kernel;
+    if (transform_working_space)
+    {
+        kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
+        ofm_ifm_h_w_to_h_w_ifm_ofm(
+            kernel, const_cast<TIn *>(kernel_hwio),
+            kernel_shape.n_output_channels,
+            kernel_shape.n_input_channels,
+            kernel_shape.n_rows,
+            kernel_shape.n_cols
+        );
+    }
+    KernelTransform::execute(
+      kernel_shape, kernel_hwio, kernel_matrices[0],
+      kernel_matrix_size_bytes / sizeof(TIn),
+      kernel_matrix_row_stride
+    );
+}
+
+template <typename TOut, typename TIn>
+winograd::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM(
+    const KernelShape &kernel_shape, const Tensor4DShape input_shape,
+    const PaddingType padding_type, void *kernel_storage
+) : kernel_shape(kernel_shape), in_shape(input_shape), padding(padding_type),
+    kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
+    manage_kernel_storage(false), _kernel_storage(kernel_storage), prof() {
+  // Zero the arrays of matrix pointers (use the array sizes, not sizeof(TIn))
+  memset(kernel_matrices, 0x00, sizeof(kernel_matrices));
+  memset(input_matrices, 0x00, sizeof(input_matrices));
+  memset(output_matrices, 0x00, sizeof(output_matrices));
+}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+winograd::Winograd2x2_3x3GEMM<TOut, TIn>::~Winograd2x2_3x3GEMM() {}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+template <typename InputTransform>
+void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_input(
+    const Tensor4DShape& input_shape,
+    const PaddingType padding_type,
+    const TIn* const input,
+    void *working_space
+) {
+  assert(working_space);
+  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+  // Split the working space into that required for 16 input matrices and
+  // output matrices.
+  const int in_matrix_stride_bytes = get_input_matrix_size(input_shape, kernel_shape, padding_type);
+  const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type);
+
+  for (int i = 0; i < 16; i++) {
+    input_matrices[i] = reinterpret_cast<TIn *>(
+        ws_bytes + i*in_matrix_stride_bytes);
+    output_matrices[i] = reinterpret_cast<TOut *>(
+        ws_bytes + 16*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
+  }
+
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type);
+  const int tile_rows = iceildiv(output_shape.n_rows, 2);
+  const int tile_cols = iceildiv(output_shape.n_cols, 2);
+  const int K = kernel_shape.n_input_channels;
+
+  const int in_matrix_row_stride = K;
+  const int in_matrix_batch_stride = tile_rows*tile_cols*in_matrix_row_stride;
+
+  // Transform the input tensor into an appropriate form
+  auto input_prep = [&] () {
+    InputTransform::execute(
+      input, input_shape, padding_type, tile_rows, tile_cols,
+      input_matrices[0], in_matrix_stride_bytes / sizeof(TIn),
+      in_matrix_batch_stride, in_matrix_row_stride
+    );
+  };
+  prof(
+    "Input Prep", input_prep,
+    InputTransform::bytes_read(input_shape, output_shape),
+    InputTransform::flops_performed(input_shape, output_shape),
+    InputTransform::bytes_written(input_shape, output_shape)
+  );
+
+}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+template <typename OutputTransform>
+void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output) {
+  assert(output_matrices[0]);
+  const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type);
+  const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type);
+  const int out_matrix_row_stride = kernel_matrix_row_stride;
+
+  // Transform the output tensor into an appropriate form
+    OutputTransform::execute(
+      output_shape,
+      output_matrices[0],
+      out_matrix_stride_bytes / sizeof(TOut),
+      out_matrix_row_stride,
+      output
+    );
+}
+
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::execute( size_t first, size_t last ) {
+  assert(input_matrices[0] && kernel_matrices[0] && output_matrices[0]);
+  assert(first < 16 && last < 16 && first < last);
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(in_shape,kernel_shape, padding);
+  const int tile_rows = iceildiv(output_shape.n_rows, 2);
+  const int tile_cols = iceildiv(output_shape.n_cols, 2);
+  const int M = in_shape.n_batches * tile_rows * tile_cols;
+  const int K = kernel_shape.n_input_channels;
+  const int N = kernel_shape.n_output_channels;
+
+  const int in_matrix_row_stride = K;
+  const int out_matrix_row_stride = kernel_matrix_row_stride;
+  // Perform the GEMMs
+  for (size_t i = first; i <= last; i++) {
+      BlockedGemm<M_BLOCK, N_BLOCK>(
+        input_matrices[i], kernel_matrices[i], output_matrices[i], M, K, N,
+        in_matrix_row_stride, kernel_matrix_row_stride, out_matrix_row_stride
+      );
+  }
+
+}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+Tensor4DShape winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(
+    const Tensor4DShape &in_shape, const KernelShape &k_shape, const PaddingType padding)  {
+  return Tensor4DShape {
+    in_shape.n_batches,
+    (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2,
+    (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2,
+    k_shape.n_output_channels
+  };
+}
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_storage_size(
+    const KernelShape &shape) {
+  return 16 * get_kernel_matrix_size(shape);
+}
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_matrix_size(
+    const KernelShape &shape) {
+  const int K = shape.n_input_channels;
+  const int N = roundup(shape.n_output_channels, N_BLOCK);
+  return sizeof(TIn) * K * N;
+}
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
+    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
+)  {
+  return 16 * get_input_matrix_size(input_shape, k_shape, padding_type) +
+         16 * get_output_matrix_size(input_shape, k_shape, padding_type);
+}
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(
+    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
+)  {
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(input_shape, k_shape, padding_type);
+  const int tile_rows = iceildiv(output_shape.n_rows, 2);
+  const int tile_cols = iceildiv(output_shape.n_cols, 2);
+  const int M = roundup(tile_rows * tile_cols, M_BLOCK);
+  const int K = k_shape.n_input_channels;
+
+  return input_shape.n_batches * M * K * sizeof(TIn);
+}
+
+template <typename TOut, typename TIn>
+size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(
+    const Tensor4DShape& input_shape, const KernelShape &k_shape,const PaddingType padding_type
+)  {
+  // Compute shape for the GEMM
+  const auto output_shape = get_output_shape(input_shape, k_shape, padding_type);
+  const int tile_rows = iceildiv(output_shape.n_rows, 2);
+  const int tile_cols = iceildiv(output_shape.n_cols, 2);
+  const int M = roundup(tile_rows * tile_cols, M_BLOCK);
+  const int N = roundup(k_shape.n_output_channels, N_BLOCK);
+
+  return input_shape.n_batches * M * N * sizeof(TOut);
+}
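
A call-order sketch for the class above, inferred from the member functions rather than
documented anywhere in the patch. KernelShape, Tensor4DShape and PaddingType come from
the library's other headers; the kernel is assumed to be in OIHW layout (passing a
transform working buffer triggers the OIHW-to-HWIO reorder), the input and output in NHWC:

#include <cstdlib>
#include "winograd_gemm.hpp"  // the header above; path is an assumption

using WinogradF32 = winograd::Winograd2x2_3x3GEMM<float, float>;

void run_winograd(const KernelShape &ks, const Tensor4DShape &in_shape,
                  const float *kernel, const float *input, float *output) {
  const PaddingType pad = PADDING_SAME;
  void *kernel_store = malloc(WinogradF32::get_kernel_storage_size(ks));
  void *kernel_ws    = malloc(WinogradF32::get_kernel_transform_working_size(ks));
  void *work         = malloc(WinogradF32::get_working_space_size(in_shape, ks, pad));

  WinogradF32 conv(ks, in_shape, pad, kernel_store);
  conv.transform_weights(kernel, kernel_ws);      // OIHW kernel -> 16 Winograd-domain matrices
  conv.reshape_input(in_shape, pad, input, work); // NHWC input  -> 16 GEMM input matrices
  conv.execute(0, 15);                            // run the 16 GEMMs
  conv.reshape_output(in_shape, pad, output);     // 16 GEMM outputs -> NHWC output tensor
  free(work); free(kernel_ws); free(kernel_store);
}
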
diff --git a/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp b/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp
new file mode 100644
index 0000000..de201fe
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+
+#include "gemm.hpp"
+#include "profiler.hpp"
+#include "utils.hpp"
+#include "shims.hpp"
+#include "winograd_gemm.hpp"
+
+#include "transforms.hpp"
+ 
+#ifndef ALLOC_ALIGN
+#define ALLOC_ALIGN 64
+#endif  // ALLOC_ALIGN
+
+
+namespace winograd_shim_nchw {
+  /***************************************************************************/
+  /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM
+   * internally.
+   */
+  template <typename TOut, typename TIn>
+  class Winograd2x2_3x3GEMM : public winograd::Winograd2x2_3x3GEMM<TOut, TIn> {
+    public:
+      /* Instantiate a new Winograd operator.
+       */
+      Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);
+
+      void nchw2nhwc( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input);
+      void nhwc2nchw( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output);
+
+
+      std::pair<TOut*,TIn*> get_nhwc_ptrs(const Tensor4DShape& input_shape,const PaddingType padding_type,void *working_space);
+
+      static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, const PaddingType padding);
+    protected:
+      /* Get the memory required to store an NHWC copy of the input tensor. */
+      static size_t get_working_nhwc_input_size(const Tensor4DShape &input_shape);
+
+      /* Get the memory required to store an NHWC copy of the output tensor. */
+      static size_t get_working_nhwc_output_size(const Tensor4DShape &output_shape, const KernelShape &k_shape, const PaddingType padding);
+  };
+} // namespace winograd_shim_nchw
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM(
+    const KernelShape &kernel_shape, const Tensor4DShape input_shape,
+        const PaddingType padding_type, void *kernel_storage
+) : winograd::Winograd2x2_3x3GEMM<TOut, TIn>(kernel_shape,input_shape,padding_type,kernel_storage) {
+}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input) {
+  assert(working_space);
+  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+
+  // Extract the top chunk of the working space to store the input and output
+  // tensors in NHWC format.
+  const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
+  const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
+
+  // Allocate working space for the input and output in NHWC format
+  TIn* const input_nhwc = reinterpret_cast<TIn *>(
+      ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes)
+  );
+
+  // Re-order the input tensor
+  this->prof(
+    "NCHW -> NHWC",
+    [input, input_shape, input_nhwc] () {
+      nchw_to_nhwc(
+        input, input_nhwc,
+        input_shape.n_batches,
+        input_shape.n_channels,
+        input_shape.n_rows,
+        input_shape.n_cols
+      );
+    },
+    input_shape.size(), 0, input_shape.size()
+  );
+}
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type, 
+            void *working_space, TOut* const output) {
+
+  assert(working_space);
+  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+
+  // Extract the top chunk of the working space to store the input and output
+  // tensors in NHWC format.
+  const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
+  const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
+
+  TOut* const output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));
+
+  // Re-order the output tensor into NCHW
+  const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape, this->kernel_shape, padding_type);
+  this->prof(
+    "NHWC -> NCHW",
+    [output_nhwc, output_shape, output] () {
+      nhwc_to_nchw(
+        output_nhwc, output,
+        output_shape.n_batches,
+        output_shape.n_rows,
+        output_shape.n_cols,
+        output_shape.n_channels
+      );
+    },
+    output_shape.size(), 0, output_shape.size()
+  );
+}
+
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+std::pair<TOut*,TIn*> winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_nhwc_ptrs(
+    const Tensor4DShape& input_shape,
+    const PaddingType padding_type,
+    void *working_space
+) {
+  assert(working_space);
+  int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+
+  // Extract the top chunk of the working space to store the input and output
+  // tensors in NHWC format.
+  const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
+  const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
+
+  // Allocate working space for the input and output in NHWC format
+  TIn* input_nhwc = reinterpret_cast<TIn *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes));
+  TOut* output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));
+  return std::make_pair(output_nhwc,input_nhwc);
+}
+
+
+
+
+/*****************************************************************************/
+template <typename TOut, typename TIn>
+size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
+    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
+)  {
+  return winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
+      input_shape, k_shape, padding_type)
+      + get_working_nhwc_input_size(input_shape)
+      + get_working_nhwc_output_size(input_shape, k_shape, padding_type);
+}
+
+template <typename TOut, typename TIn>
+size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_input_size(
+    const Tensor4DShape& input_shape
+)  {
+  return roundup(input_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN));
+}
+
+template <typename TOut, typename TIn>
+size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_output_size(
+    const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
+)  {
+  const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape,k_shape, padding_type);
+  return roundup(output_shape.size() * sizeof(TOut), static_cast<size_t>(ALLOC_ALIGN));
+}
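
The NCHW shim above wraps the same pipeline for NCHW tensors: the input is first copied
into an NHWC scratch area at the tail of the working space, the base-class transforms and
GEMMs run on that copy, and the result is converted back. A sketch under the same
assumptions and includes as the previous one (here the kernel is assumed to be already in
HWIO form, so no transform working buffer is passed):

using WinogradNCHW = winograd_shim_nchw::Winograd2x2_3x3GEMM<float, float>;

void run_winograd_nchw(const KernelShape &ks, const Tensor4DShape &in_shape,
                       const float *kernel, const float *input, float *output) {
  const PaddingType pad = PADDING_SAME;
  void *kernel_store = malloc(WinogradNCHW::get_kernel_storage_size(ks));
  void *work         = malloc(WinogradNCHW::get_working_space_size(in_shape, ks, pad));

  WinogradNCHW conv(ks, in_shape, pad, kernel_store);
  conv.transform_weights(kernel, nullptr);              // HWIO kernel, no reorder needed
  conv.nchw2nhwc(in_shape, pad, work, input);           // NCHW input -> NHWC scratch
  auto nhwc = conv.get_nhwc_ptrs(in_shape, pad, work);  // {output_nhwc, input_nhwc}
  conv.reshape_input(in_shape, pad, nhwc.second, work);
  conv.execute(0, 15);
  conv.reshape_output(in_shape, pad, nhwc.first);
  conv.nhwc2nchw(in_shape, pad, work, output);          // NHWC scratch -> NCHW output
  free(work); free(kernel_store);
}
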
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
new file mode 100644
index 0000000..fea635b
--- /dev/null
+++ b/src/core/Rounding.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/Rounding.h"
+
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+using namespace std;
+
+int arm_compute::round(float x, RoundingPolicy rounding_policy)
+{
+    using namespace std;
+    int rounded = 0;
+    switch(rounding_policy)
+    {
+        case RoundingPolicy::TO_ZERO:
+        {
+            rounded = static_cast<int>(x);
+            break;
+        }
+        case RoundingPolicy::TO_NEAREST_UP:
+        {
+            rounded = static_cast<int>(support::cpp11::round(x));
+            break;
+        }
+        case RoundingPolicy::TO_NEAREST_EVEN:
+        {
+            ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Unsupported rounding policy.");
+            break;
+        }
+    }
+
+    return rounded;
+}
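
A minimal usage sketch for the new rounding helper added above; only the two supported policies are exercised, since TO_NEAREST_EVEN currently raises an error:

    #include "arm_compute/core/Rounding.h"

    #include <iostream>

    int main()
    {
        using arm_compute::RoundingPolicy;
        std::cout << arm_compute::round(2.7f, RoundingPolicy::TO_ZERO) << "\n";       // prints 2
        std::cout << arm_compute::round(2.5f, RoundingPolicy::TO_NEAREST_UP) << "\n"; // prints 3 (half rounds away from zero)
        return 0;
    }
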
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index f5a282d..8acd71c 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
@@ -34,11 +35,15 @@
 {
 }
 
-SubTensorInfo::SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords)
     : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+    // Validate the sub-tensor only once the parent has been configured
+    if(parent->tensor_shape().total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
+    }
 
     // Initialize valid region
     Coordinates coordinates;
@@ -46,11 +51,27 @@
     _valid_region = ValidRegion{ coordinates, _tensor_shape };
 }
 
-void SubTensorInfo::set_tensor_shape(TensorShape shape)
+std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
+{
+    // Clone the parent's info (which yields a TensorInfo) and overwrite its shape and valid region;
+    // explicitly copying a SubTensorInfo does not make sense for now
+    ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+    auto clone_obj = _parent->clone();
+    clone_obj->set_tensor_shape(_tensor_shape);
+    clone_obj->set_valid_region(_valid_region);
+    return clone_obj;
+}
+
+ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape)
 {
     ARM_COMPUTE_ERROR_ON(_parent == nullptr);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+    // Validate the sub-tensor only once the parent has been configured
+    if(_parent->tensor_shape().total_size() != 0)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
+    }
     _tensor_shape = shape;
+    return *this;
 }
 
 bool SubTensorInfo::extend_padding(const PaddingSize &padding)
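
A sketch of the relaxed SubTensorInfo behaviour introduced above: the sub-tensor bounds check is skipped while the parent is still unconfigured, set_tensor_shape() now returns the info so calls can be chained, and clone() produces a TensorInfo-backed copy. The shapes and coordinates below are illustrative only:

    #include "arm_compute/core/SubTensorInfo.h"
    #include "arm_compute/core/TensorInfo.h"

    #include <memory>

    using namespace arm_compute;

    void subtensor_info_example()
    {
        // Parent not configured yet: the ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR check is skipped
        TensorInfo    deferred_parent;
        SubTensorInfo sub(&deferred_parent, TensorShape(16U, 16U, 2U), Coordinates(0, 0, 1));
        sub.set_tensor_shape(TensorShape(16U, 16U, 4U)); // returns ITensorInfo&

        // Configured parent: clone() yields a TensorInfo carrying the sub-tensor's shape and valid region
        TensorInfo    parent(TensorShape(16U, 16U, 8U), 1, DataType::F32);
        SubTensorInfo sub2(&parent, TensorShape(16U, 16U, 2U), Coordinates(0, 0, 1));
        std::unique_ptr<ITensorInfo> cloned = sub2.clone();
    }
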
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 91a3531..60e76bf 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -26,13 +26,15 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/HOGInfo.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 TensorInfo::TensorInfo()
     : _total_size(0), _fixed_point_position(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN),
-      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }
+      _is_resizable{ true }, _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info()
 {
 }
 
@@ -50,6 +52,7 @@
     _is_resizable                  = info.is_resizable();
     _valid_region                  = info.valid_region();
     _padding                       = info.padding();
+    _quantization_info             = info.quantization_info();
 }
 
 TensorInfo::TensorInfo(Format format)
@@ -80,6 +83,13 @@
     init(tensor_shape, num_channels, data_type, fixed_point_position);
 }
 
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+    : TensorInfo()
+{
+    init(tensor_shape, num_channels, data_type, 0);
+    _quantization_info = quantization_info;
+}
+
 TensorInfo::TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height)
     : TensorInfo()
 {
@@ -306,19 +316,26 @@
     return updated;
 }
 
-void TensorInfo::set_data_type(DataType data_type)
+std::unique_ptr<ITensorInfo> TensorInfo::clone() const
+{
+    return support::cpp14::make_unique<TensorInfo>(*this);
+}
+
+ITensorInfo &TensorInfo::set_data_type(DataType data_type)
 {
     _data_type = data_type;
     _format    = Format::UNKNOWN;
+    return *this;
 }
 
-void TensorInfo::set_num_channels(int num_channels)
+ITensorInfo &TensorInfo::set_num_channels(int num_channels)
 {
     _num_channels = num_channels;
     _format       = Format::UNKNOWN;
+    return *this;
 }
 
-void TensorInfo::set_format(Format format)
+ITensorInfo &TensorInfo::set_format(Format format)
 {
     _format = format;
 
@@ -332,9 +349,10 @@
         ARM_COMPUTE_ERROR_ON(num_channels_from_format(format) != _num_channels);
         ARM_COMPUTE_ERROR_ON(data_type_from_format(format) != _data_type);
     }
+    return *this;
 }
 
-void TensorInfo::set_tensor_shape(TensorShape shape)
+ITensorInfo &TensorInfo::set_tensor_shape(TensorShape shape)
 {
     _tensor_shape                  = shape;
     _offset_first_element_in_bytes = 0;
@@ -353,13 +371,31 @@
     Coordinates coordinates;
     coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
     _valid_region = ValidRegion{ coordinates, _tensor_shape };
+    return *this;
 }
 
-void TensorInfo::set_fixed_point_position(int fixed_point_position)
+ITensorInfo &TensorInfo::set_fixed_point_position(int fixed_point_position)
 {
     ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS8 && (fixed_point_position < 1 || fixed_point_position > 6));
     ARM_COMPUTE_ERROR_ON(_data_type == DataType::QS16 && (fixed_point_position < 1 || fixed_point_position > 14));
     _fixed_point_position = fixed_point_position;
+    return *this;
+}
+
+ITensorInfo &TensorInfo::set_quantization_info(QuantizationInfo quantization_info)
+{
+    _quantization_info = quantization_info;
+    return *this;
+}
+
+ITensorInfo &TensorInfo::reset_padding()
+{
+    _padding = PaddingSize();
+    if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
+    {
+        std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+    }
+    return *this;
 }
 
 size_t TensorInfo::offset_element_in_bytes(const Coordinates &pos) const
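
With the setters above now returning ITensorInfo&, tensor metadata can be configured in a fluent chain, and clone(), reset_padding() and the QuantizationInfo constructor become available. A short sketch, assuming the QASYMM8 data type and QuantizationInfo(scale, offset) introduced alongside these changes:

    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void tensor_info_example()
    {
        // Metadata for an asymmetrically quantized tensor via the new constructor
        TensorInfo qinfo(TensorShape(224U, 224U, 3U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 128));

        // Setters return ITensorInfo&, so configuration reads as one chain
        TensorInfo info;
        info.set_data_type(DataType::F32)
            .set_num_channels(1)
            .set_tensor_shape(TensorShape(32U, 32U));

        auto copy = info.clone(); // std::unique_ptr<ITensorInfo>
        info.reset_padding();     // drop padding and recompute strides / total size
    }
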
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 99d3956..76d0b0f 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -21,10 +21,13 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #include "arm_compute/core/Utils.h"
 
 #include "arm_compute/core/FixedPoint.h"
 
+#include "support/ToolchainSupport.h"
+
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -247,41 +250,80 @@
     return res;
 }
 
+TensorShape arm_compute::deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights)
+{
+    TensorShape out_shape(input);
+    out_shape.set(0, out_dims.first);
+    out_shape.set(1, out_dims.second);
+    out_shape.set(2, weights[3]);
+    return out_shape;
+}
+
+const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
+    unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
+    unsigned int ax, unsigned int ay, float upscalex, float upscaley, DimensionRoundingType round)
+{
+    ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
+    ARM_COMPUTE_ERROR_ON(((in_width - 1) * upscalex + kernel_width + ax) < 2.f * padx);
+    ARM_COMPUTE_ERROR_ON(((in_height - 1) * upscaley + kernel_height + ay) < 2.f * pady);
+    const float fw = (in_width - 1) * upscalex - 2.f * padx + kernel_width + ax;
+    const float fh = (in_height - 1) * upscaley - 2.f * pady + kernel_height + ay;
+    int         w  = 0;
+    int         h  = 0;
+    switch(round)
+    {
+        case DimensionRoundingType::FLOOR:
+            w = std::floor(fw);
+            h = std::floor(fh);
+            break;
+        case DimensionRoundingType::CEIL:
+            w = std::ceil(fw);
+            h = std::ceil(fh);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+    }
+    return std::make_pair<unsigned int, unsigned int>(w, h);
+}
+
 const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
                                                                            unsigned int kernel_width, unsigned int kernel_height,
                                                                            const PadStrideInfo &pad_stride_info)
 {
-    const unsigned int pad_x    = pad_stride_info.pad().first;
-    const unsigned int pad_y    = pad_stride_info.pad().second;
-    const unsigned int stride_x = pad_stride_info.stride().first;
-    const unsigned int stride_y = pad_stride_info.stride().second;
-    unsigned int       w        = 0;
-    unsigned int       h        = 0;
+    const unsigned int pad_left   = pad_stride_info.pad_left();
+    const unsigned int pad_top    = pad_stride_info.pad_top();
+    const unsigned int pad_right  = pad_stride_info.pad_right();
+    const unsigned int pad_bottom = pad_stride_info.pad_bottom();
+    const unsigned int stride_x   = pad_stride_info.stride().first;
+    const unsigned int stride_y   = pad_stride_info.stride().second;
+    unsigned int       w          = 0;
+    unsigned int       h          = 0;
     switch(pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + 2 * pad_x - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + 2 * pad_y - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
     }
 
     // Make sure that border operations will start from inside the input and not the padded area
-    if(((w - 1) * stride_x) >= (width + pad_x))
+    if(((w - 1) * stride_x) >= (width + pad_left))
     {
         --w;
     }
-    if(((h - 1) * stride_y) >= (height + pad_y))
+    if(((h - 1) * stride_y) >= (height + pad_top))
     {
         --h;
     }
-    ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_x));
-    ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_y));
+    ARM_COMPUTE_ERROR_ON(((w - 1) * stride_x) >= (width + pad_left));
+    ARM_COMPUTE_ERROR_ON(((h - 1) * stride_y) >= (height + pad_top));
 
     return std::make_pair(w, h);
 }
@@ -314,6 +356,7 @@
             print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
             break;
         case DataType::F16:
+            print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim);
             break;
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -341,8 +384,9 @@
         case DataType::F32:
             return max_consecutive_elements_display_width_impl<float>(s, reinterpret_cast<const float *>(ptr), n);
         case DataType::F16:
-            return 0;
+            return max_consecutive_elements_display_width_impl<half>(s, reinterpret_cast<const half *>(ptr), n);
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
     }
+    return 0;
 }
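
The reworked scaled_dimensions() above takes asymmetric padding into account, and deconvolution_output_dimensions() is new. A worked sketch of both formulas; the seven-argument PadStrideInfo constructor (stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom, rounding) is assumed to be the one introduced with the left/right/top/bottom accessors used above:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    using namespace arm_compute;

    void output_dimensions_example()
    {
        // 7x7 input, 3x3 kernel, stride 2, padding 1 on the left/top only:
        // floor((7 + 1 + 0 - 3) / 2) + 1 = 3  ->  3x3 output
        const PadStrideInfo conv_info(2, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR);
        const auto          conv_out = scaled_dimensions(7, 7, 3, 3, conv_info);

        // Deconvolution: 4x4 input, 3x3 kernel, no padding, upscale x2, inner border 0:
        // (4 - 1) * 2 - 0 + 3 + 0 = 9  ->  9x9 output
        const auto deconv_out = deconvolution_output_dimensions(4, 4, 3, 3, 0, 0, 0, 0, 2.f, 2.f,
                                                                DimensionRoundingType::FLOOR);
        (void)conv_out;
        (void)deconv_out;
    }
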
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 084a325..f495e48 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -23,108 +23,88 @@
  */
 #include "arm_compute/core/Validate.h"
 
-void arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
-                                               const arm_compute::Window &full, const arm_compute::Window &win)
+arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
+                                                              const arm_compute::Window &full, const arm_compute::Window &win)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-
     full.validate();
     win.validate();
 
     for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != win[i].step(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].step() != win[i].step(), function, file, line);
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
-                                             const arm_compute::Window &full, const arm_compute::Window &sub)
+arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
+                                                            const arm_compute::Window &full, const arm_compute::Window &sub)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-
     full.validate();
     sub.validate();
 
     for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC(full[i].step() != sub[i].step(), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC((sub[i].start() - full[i].start()) % sub[i].step(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].step() != sub[i].step(), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC((sub[i].start() - full[i].start()) % sub[i].step(), function, file, line);
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
-                                                               const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
+                                                                              const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(dim);
-
     full.validate();
     window.validate();
 
-    ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != 0, function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC(window[dim].start() != full[dim].start(), function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC(full[dim].end() != window[dim].end(), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(window[dim].start() != 0, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(window[dim].start() != full[dim].start(), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[dim].end() != window[dim].end(), function, file, line);
+
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
-                                                      const arm_compute::Coordinates &pos, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
+                                                                     const arm_compute::Coordinates &pos, unsigned int max_dim)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(pos);
-
     for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC(pos[i] != 0, function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line);
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
-                                                 const arm_compute::Window &win, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
+                                                                const arm_compute::Window &win, unsigned int max_dim)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(win);
-
     for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
-                                     function, file, line,
-                                     "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(win[i].start() != 0 || win[i].end() != win[i].step(),
+                                            function, file, line,
+                                            "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
-                                         const arm_compute::ITensor *tensor)
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+                                                        const arm_compute::ITensor *tensor)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(tensor);
-
-    ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor->info()->num_dimensions() != 2,
-                                 function, file, line,
-                                 "Only 2D Tensors are supported by this kernel (%d passed)", tensor->info()->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor->info()->num_dimensions() != 2,
+                                        function, file, line,
+                                        "Only 2D Tensors are supported by this kernel (%d passed)", tensor->info()->num_dimensions());
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
-                                                       arm_compute::Format fmt, arm_compute::Channel cn)
+arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
+                                                                      arm_compute::Format fmt, arm_compute::Channel cn)
 {
-    ARM_COMPUTE_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
 
     switch(fmt)
     {
@@ -148,84 +128,68 @@
         default:
             ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format.");
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_invalid_multi_hog(const char *function, const char *file, const int line,
-                                             const arm_compute::IMultiHOG *multi_hog)
+arm_compute::Status arm_compute::error_on_invalid_multi_hog(const char *function, const char *file, const int line,
+                                                            const arm_compute::IMultiHOG *multi_hog)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-
-    ARM_COMPUTE_ERROR_ON_LOC(nullptr == multi_hog, function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC(0 == multi_hog->num_models(), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(nullptr == multi_hog, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(0 == multi_hog->num_models(), function, file, line);
 
     for(size_t i = 1; i < multi_hog->num_models(); ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->phase_type() != multi_hog->model(i)->info()->phase_type(),
-                                     function, file, line,
-                                     "All HOG parameters must have the same phase type");
-        ARM_COMPUTE_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->normalization_type() != multi_hog->model(i)->info()->normalization_type(),
-                                     function, file, line,
-                                     "All HOG parameters must have the same normalization type");
-        ARM_COMPUTE_ERROR_ON_LOC_MSG((multi_hog->model(0)->info()->l2_hyst_threshold() != multi_hog->model(i)->info()->l2_hyst_threshold())
-                                     && (multi_hog->model(0)->info()->normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
-                                     function, file, line,
-                                     "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->phase_type() != multi_hog->model(i)->info()->phase_type(),
+                                            function, file, line,
+                                            "All HOG parameters must have the same phase type");
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(multi_hog->model(0)->info()->normalization_type() != multi_hog->model(i)->info()->normalization_type(),
+                                            function, file, line,
+                                            "All HOG parameters must have the same normalization type");
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((multi_hog->model(0)->info()->l2_hyst_threshold() != multi_hog->model(i)->info()->l2_hyst_threshold())
+                                            && (multi_hog->model(0)->info()->normalization_type() == arm_compute::HOGNormType::L2HYS_NORM),
+                                            function, file, line,
+                                            "All HOG parameters must have the same l2 hysteresis threshold if you use L2 hysteresis normalization type");
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
-                                               const arm_compute::IKernel *kernel)
+arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+                                                              const arm_compute::IKernel *kernel)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(kernel);
-
-    ARM_COMPUTE_ERROR_ON_LOC(kernel == nullptr, function, file, line);
-    ARM_COMPUTE_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0) && (kernel->window().x().step() == 0),
-                                 function, file, line,
-                                 "This kernel hasn't been configured.");
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((kernel->window().x().start() == kernel->window().x().end()) && (kernel->window().x().end() == 0) && (kernel->window().x().step() == 0),
+                                        function, file, line,
+                                        "This kernel hasn't been configured.");
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
-                                             const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
+                                                            const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(parent_shape);
-    ARM_COMPUTE_UNUSED(coords);
-    ARM_COMPUTE_UNUSED(shape);
-
     // Subtensor should not index in x, y dimensions.
-    ARM_COMPUTE_ERROR_ON_LOC(((coords.x() != 0) && (coords.y() != 0)), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) && (coords.y() != 0)), function, file, line);
     // Subtensor shape should match parent tensor in x, y dimensions.
-    ARM_COMPUTE_ERROR_ON_LOC(((parent_shape.x() != shape.x()) && (parent_shape.y() != parent_shape.y())), function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) && (parent_shape.y() != shape.y())), function, file, line);
 
     // Check dimensions
     for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
-                                 function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
+                                        function, file, line);
     }
+    return arm_compute::Status{};
 }
 
-void arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
-                                                          const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
+                                                                         const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
 {
-    ARM_COMPUTE_UNUSED(function);
-    ARM_COMPUTE_UNUSED(file);
-    ARM_COMPUTE_UNUSED(line);
-    ARM_COMPUTE_UNUSED(parent_valid_region);
-    ARM_COMPUTE_UNUSED(valid_region);
-
     // Check valid regions
     for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
     {
-        ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
-        ARM_COMPUTE_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
-                                 function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+                                        function, file, line);
     }
+
+    return arm_compute::Status{};
 }
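
All the checks above now report an arm_compute::Status instead of asserting, which is what makes the ARM_COMPUTE_RETURN_* macros composable inside validate() functions. A hypothetical helper following that pattern (my_kernel_validate and its specific checks are made up for illustration; the macros and Status are used exactly as in the hunks above):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/ITensorInfo.h"

    namespace
    {
    arm_compute::Status my_kernel_validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(input == nullptr);
        ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
        ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() != 2);
        ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != output->data_type());
        return arm_compute::Status{};
    }
    } // namespace
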
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/utils/io/FileHandler.cpp
similarity index 61%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/utils/io/FileHandler.cpp
index 37857b6..70bce42 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,46 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include <string>
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/utils/io/FileHandler.h"
+
+#include "arm_compute/core/Error.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
+using namespace arm_compute::io;
 
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+FileHandler::FileHandler()
+    : _filestream(), _filename(" "), _mode()
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
+}
+
+FileHandler::~FileHandler()
+{
+    close();
+}
+
+void FileHandler::open(const std::string &filename, std::ios_base::openmode mode)
+{
+    close();
+    _filestream.open(filename, mode);
+    ARM_COMPUTE_ERROR_ON(!_filestream.good());
+    _filename = filename;
+    _mode     = mode;
+}
+
+void FileHandler::close()
+{
+    _filestream.close();
+}
+
+std::fstream &FileHandler::stream()
+{
+    return _filestream;
+}
+
+std::string FileHandler::filename() const
+{
+    return _filename;
 }
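
A short usage sketch for the FileHandler added above (filename and content are placeholders):

    #include "arm_compute/core/utils/io/FileHandler.h"

    void file_handler_example()
    {
        arm_compute::io::FileHandler handler;
        handler.open("dump.txt", std::ios_base::out | std::ios_base::trunc);
        handler.stream() << "tensor dump goes here\n";
        handler.close(); // also invoked by the destructor
    }
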
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/utils/logging/FilePrinter.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/utils/logging/FilePrinter.cpp
index 37857b6..b699afc 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,17 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/core/utils/logging/FilePrinter.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
+using namespace arm_compute::logging;
 
-#include <utility>
-
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+FilePrinter::FilePrinter(const std::string &filename)
+    : _handler()
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
+    _handler.open(filename, std::fstream::out | std::fstream::trunc);
 }
+
+void FilePrinter::print_internal(const std::string &msg)
+{
+    _handler.stream() << msg << std::endl;
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/core/utils/logging/Helpers.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/core/utils/logging/Helpers.cpp
index 37857b6..f5ab608 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/core/utils/logging/Helpers.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
+#include <map>
+#include <string>
 
-#include <utility>
+using namespace arm_compute::logging;
 
-using namespace arm_compute;
-
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
-}
+    static std::map<LogLevel, const std::string> log_level_map =
+    {
+        { LogLevel::VERBOSE, "VERBOSE" },
+        { LogLevel::INFO, "INFO" },
+        { LogLevel::WARN, "WARN" },
+        { LogLevel::OFF, "OFF" },
+    };
+
+    return log_level_map[log_level];
+}
\ No newline at end of file
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
new file mode 100644
index 0000000..b025ca8
--- /dev/null
+++ b/src/core/utils/logging/Logger.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/logging/Logger.h"
+
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::logging;
+
+Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer)
+    : _name(std::move(name)), _log_level(log_level), _printers({ std::move(printer) }), _decorators()
+{
+    // Check the printer (the argument has already been moved into _printers)
+    ARM_COMPUTE_ERROR_ON(_printers.back() == nullptr);
+
+    // Set default message decorators
+    set_default_decorators();
+}
+
+Logger::Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+    : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators()
+{
+    // Check printers
+    for(const auto &p : _printers)
+    {
+        ARM_COMPUTE_UNUSED(p);
+        ARM_COMPUTE_ERROR_ON(p == nullptr);
+    }
+    // Set default message decorators
+    set_default_decorators();
+}
+
+Logger::Logger(std::string                              name,
+               LogLevel                                 log_level,
+               std::vector<std::shared_ptr<Printer>>    printers,
+               std::vector<std::unique_ptr<IDecorator>> decorators)
+    : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators))
+{
+    // Check printers
+    for(const auto &p : _printers)
+    {
+        ARM_COMPUTE_UNUSED(p);
+        ARM_COMPUTE_ERROR_ON(p == nullptr);
+    }
+    // Check decorators
+    for(const auto &d : _decorators)
+    {
+        ARM_COMPUTE_UNUSED(d);
+        ARM_COMPUTE_ERROR_ON(d == nullptr);
+    }
+}
+
+void Logger::log(LogLevel log_level, const std::string &msg)
+{
+    // Return if the message shouldn't be logged,
+    // i.e. if its level is below the logger's level
+    if(!is_loggable(log_level))
+    {
+        return;
+    }
+
+    // Print message to all printers
+    print_all(create_log_msg(msg, log_level));
+}
+
+void Logger::set_log_level(LogLevel log_level)
+{
+    _log_level = log_level;
+}
+
+LogLevel Logger::log_level() const
+{
+    return _log_level;
+}
+
+std::string Logger::name() const
+{
+    return _name;
+}
+
+void Logger::add_printer(std::shared_ptr<Printer> printer)
+{
+    ARM_COMPUTE_ERROR_ON(printer == nullptr);
+    _printers.push_back(std::move(printer));
+}
+
+void Logger::add_decorator(std::unique_ptr<IDecorator> decorator)
+{
+    ARM_COMPUTE_ERROR_ON(decorator == nullptr);
+    _decorators.push_back(std::move(decorator));
+}
+
+void Logger::set_default_decorators()
+{
+    _decorators.emplace_back(support::cpp14::make_unique<StringDecorator>(_name));
+    _decorators.emplace_back(support::cpp14::make_unique<DateDecorator>());
+    _decorators.emplace_back(support::cpp14::make_unique<LogLevelDecorator>());
+}
+
+bool Logger::is_loggable(LogLevel log_level)
+{
+    return (log_level >= _log_level);
+}
+
+void Logger::decorate_log_msg(LogMsg &msg)
+{
+    for(const auto &d : _decorators)
+    {
+        d->decorate(msg);
+    }
+    msg.raw_ += std::string(" ");
+}
+
+std::string Logger::create_log_msg(const std::string &str, LogLevel log_level)
+{
+    // Start from a space string to avoid failures on Android
+    LogMsg log_msg(" ", log_level);
+    decorate_log_msg(log_msg);
+    std::ostringstream ss;
+    ss << log_msg.raw_ << " " << str;
+    return ss.str();
+}
+
+void Logger::print_all(const std::string &msg)
+{
+    for(auto &p : _printers)
+    {
+        p->print(msg);
+    }
+}
\ No newline at end of file
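
A minimal sketch combining the Logger above with the FilePrinter added earlier in this diff; by default the name, date and log-level decorators from set_default_decorators() prefix every message, and messages below the logger's level are dropped. The logger name and file name are placeholders:

    #include "arm_compute/core/utils/logging/FilePrinter.h"
    #include "arm_compute/core/utils/logging/Logger.h"

    #include <memory>

    void logger_example()
    {
        using namespace arm_compute::logging;

        auto   printer = std::make_shared<FilePrinter>("network.log");
        Logger logger("EXAMPLE", LogLevel::INFO, printer);

        logger.log(LogLevel::INFO, "Graph configured");      // written to network.log
        logger.log(LogLevel::VERBOSE, "Per-layer timings");  // dropped: VERBOSE is below INFO
    }
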
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
new file mode 100644
index 0000000..99236d2
--- /dev/null
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/logging/LoggerRegistry.h"
+
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::logging;
+
+/** Reserved loggers used by the library */
+std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+
+LoggerRegistry::LoggerRegistry()
+    : _mtx(), _loggers()
+{
+}
+
+LoggerRegistry &LoggerRegistry::get()
+{
+    static LoggerRegistry _instance;
+    return _instance;
+}
+
+void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+    {
+        _loggers[name] = std::make_shared<Logger>(name, log_level, std::move(printers));
+    }
+}
+
+void LoggerRegistry::remove_logger(const std::string &name)
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    if(_loggers.find(name) != _loggers.end())
+    {
+        _loggers.erase(name);
+    }
+}
+
+std::shared_ptr<Logger> LoggerRegistry::logger(const std::string &name)
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    return (_loggers.find(name) != _loggers.end()) ? _loggers[name] : nullptr;
+}
+
+void LoggerRegistry::create_reserved_loggers(LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers)
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    for(const auto &r : _reserved_loggers)
+    {
+        if(_loggers.find(r) == _loggers.end())
+        {
+            _loggers[r] = std::make_shared<Logger>(r, log_level, printers);
+        }
+    }
+}
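
A sketch of the registry above: create the reserved loggers once and fetch them by name afterwards (the printer and message are placeholders):

    #include "arm_compute/core/utils/logging/FilePrinter.h"
    #include "arm_compute/core/utils/logging/LoggerRegistry.h"

    #include <memory>

    void logger_registry_example()
    {
        using namespace arm_compute::logging;

        LoggerRegistry::get().create_reserved_loggers(LogLevel::WARN,
                                                      { std::make_shared<FilePrinter>("acl.log") });

        if(auto core_logger = LoggerRegistry::get().logger("CORE"))
        {
            core_logger->log(LogLevel::WARN, "Falling back to the reference implementation");
        }
    }
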
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
new file mode 100644
index 0000000..8bb6d8e
--- /dev/null
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include <cmath>
+#include <limits>
+#include <numeric>
+
+using namespace arm_compute::quantization;
+
+constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(double multiplier,
+                                                                                            int   *quant_multiplier,
+                                                                                            int   *right_shift)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier >= 1);
+    if(multiplier == 0)
+    {
+        *quant_multiplier = 0;
+        *right_shift      = 0;
+        return arm_compute::Status{};
+    }
+    const double q = std::frexp(multiplier, right_shift);
+    *right_shift *= -1;
+    auto q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
+    if(q_fixed == fixed_point_one_Q0)
+    {
+        q_fixed /= 2;
+        --*right_shift;
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(*right_shift < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > std::numeric_limits<int32_t>::max());
+    *quant_multiplier = static_cast<int32_t>(q_fixed);
+
+    return arm_compute::Status{};
+}
+
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(double multiplier,
+                                                                                               int   *quantized_multiplier,
+                                                                                               int   *left_shift)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 1.f);
+    const double q       = std::frexp(multiplier, left_shift);
+    auto         q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
+    if(q_fixed == fixed_point_one_Q0)
+    {
+        q_fixed /= 2;
+        ++*left_shift;
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(*left_shift < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > std::numeric_limits<int32_t>::max());
+    *quantized_multiplier = static_cast<int32_t>(q_fixed);
+
+    return arm_compute::Status{};
+}
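
A worked example for calculate_quantized_multiplier_less_than_one(): a real multiplier of 0.25 decomposes as 0.5 * 2^-1, and 0.5 in Q0.31 is 1 << 30, so the fixed-point multiplier is 1073741824 with a right shift of 1:

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    #include <iostream>

    int main()
    {
        int quant_multiplier = 0;
        int right_shift      = 0;
        arm_compute::quantization::calculate_quantized_multiplier_less_than_one(0.25, &quant_multiplier, &right_shift);
        std::cout << quant_multiplier << " >> " << right_shift << "\n"; // prints "1073741824 >> 1"
        return 0;
    }
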
diff --git a/src/graph/CL/CLMap.cpp b/src/graph/CL/CLMap.cpp
index 4892b96..5289ea9 100644
--- a/src/graph/CL/CLMap.cpp
+++ b/src/graph/CL/CLMap.cpp
@@ -23,20 +23,21 @@
  */
 #include "arm_compute/graph/CL/CLMap.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute::graph;
 
-CLMap::CLMap(Tensor *tensor, bool blocking)
-    : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor())), _blocking(blocking)
+CLMap::CLMap(ITensorObject *tensor, bool blocking)
+    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor())), _blocking(blocking)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
 }
 
 void CLMap::run()
 {
-    _tensor->map(_blocking);
+    _tensor->map(arm_compute::CLScheduler::get().queue(), _blocking);
 }
diff --git a/src/graph/CL/CLUnmap.cpp b/src/graph/CL/CLUnmap.cpp
index ec7d865..31f2f19 100644
--- a/src/graph/CL/CLUnmap.cpp
+++ b/src/graph/CL/CLUnmap.cpp
@@ -23,20 +23,21 @@
  */
 #include "arm_compute/graph/CL/CLUnmap.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute::graph;
 
-CLUnmap::CLUnmap(Tensor *tensor)
-    : _tensor(dynamic_cast<arm_compute::CLTensor *>(tensor->tensor()))
+CLUnmap::CLUnmap(ITensorObject *tensor)
+    : _tensor(dynamic_cast<arm_compute::ICLTensor *>(tensor->tensor()))
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(_tensor);
 }
 
 void CLUnmap::run()
 {
-    _tensor->unmap();
+    _tensor->unmap(arm_compute::CLScheduler::get().queue());
 }
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 7dddb1c..ac5316f 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -26,16 +26,19 @@
 #include "arm_compute/graph/CL/CLMap.h"
 #include "arm_compute/graph/CL/CLUnmap.h"
 #include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
 #include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/Tensor.h"
+#include "support/ToolchainSupport.h"
 
 using namespace arm_compute::graph;
 
 struct Stage
 {
-    Tensor                                 *_input;
-    Tensor                                 *_output;
+    ITensorObject                          *_input;
+    ITensorObject                          *_output;
     std::unique_ptr<arm_compute::IFunction> _function;
 };
 
@@ -48,20 +51,21 @@
      */
     void configure(GraphHints _next_hints);
 
-    GraphContext                         _ctx{};
-    std::vector<Stage>                   _pipeline{};
-    std::vector<std::unique_ptr<Tensor>> _tensors{};
-    std::vector<std::unique_ptr<INode>>  _nodes{};
-    GraphHints                           _current_hints{};
-    GraphHints                           _next_hints{};
-    std::unique_ptr<Tensor>              _graph_input{ nullptr };
-    std::unique_ptr<Tensor>              _graph_output{ nullptr };
-    std::unique_ptr<INode>               _current_node{ nullptr };
-    Tensor                              *_current_output{ nullptr };
+    GraphContext                                _ctx{};
+    std::vector<Stage>                          _pipeline{};
+    std::vector<std::unique_ptr<ITensorObject>> _tensors{};
+    std::vector<std::unique_ptr<INode>>         _nodes{};
+    GraphHints                                  _current_hints{};
+    GraphHints                                  _next_hints{};
+    std::unique_ptr<ITensorObject>              _graph_input{ nullptr };
+    std::unique_ptr<ITensorObject>              _graph_output{ nullptr };
+    std::unique_ptr<INode>                      _current_node{ nullptr };
+    ITensorObject                              *_current_output{ nullptr };
+    bool                                        _info_enabled{ false };
 
 private:
-    Tensor    *_current_input{ nullptr };
-    GraphHints _previous_hints{};
+    ITensorObject *_current_input{ nullptr };
+    GraphHints     _previous_hints{};
 };
 
 Graph::~Graph() //NOLINT
@@ -72,13 +76,18 @@
 Graph::Graph()
     : _pimpl{ new Private() }
 {
+    // Check if OpenCL is available and initialize the scheduler
+    if(opencl_is_available())
+    {
+        arm_compute::CLScheduler::get().default_init();
+    }
 }
 
 void Graph::run()
 {
     while(true)
     {
-        if(!_pimpl->_graph_input->call_accessor())
+        if(_pimpl->_graph_input->has_accessor() && !_pimpl->_graph_input->call_accessor())
         {
             return;
         }
@@ -88,7 +97,8 @@
             stage._function->run();
         }
 
-        if(!_pimpl->_graph_output->call_accessor())
+        if((_pimpl->_graph_output->has_accessor() && !_pimpl->_graph_output->call_accessor())
+           || (!_pimpl->_graph_output->has_accessor()))
         {
             return;
         }
@@ -126,9 +136,11 @@
         _current_output->set_target(TargetHint::NEON);
     }
 
-    // Update ctx and instantiate node
+    // Instantiate Node
     _ctx.hints()                                 = _current_hints;
-    std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input->tensor(), _current_output->tensor());
+    std::unique_ptr<arm_compute::IFunction> func = _current_node->instantiate_node(_ctx, _current_input, _current_output);
+
+    // Allocate current input
     _current_input->allocate();
 
     // Map input if needed
@@ -181,7 +193,7 @@
 }
 
 //Add a tensor with an Accessor (i.e either the input or output of the graph)
-void Graph::add_tensor(std::unique_ptr<Tensor> tensor)
+void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
 {
     // If it's the first Tensor added then it will be the input of the Graph.
     if(_pimpl->_graph_input == nullptr)
@@ -203,6 +215,10 @@
         _pimpl->_graph_output->allocate();
     }
 }
+bool Graph::opencl_is_available()
+{
+    return arm_compute::opencl_is_available();
+}
 
 void Graph::set_temp(TensorInfo &&tmp)
 {
@@ -227,7 +243,13 @@
 
 Graph &arm_compute::graph::operator<<(Graph &graph, Tensor &&tensor)
 {
-    graph.add_tensor(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
+    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
+    return graph;
+}
+
+Graph &arm_compute::graph::operator<<(Graph &graph, SubTensor &&sub_tensor)
+{
+    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
     return graph;
 }
 
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 4b383f5..582f936 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -26,8 +26,6 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Validate.h"
 
-#include <ostream>
-
 using namespace arm_compute::graph;
 
 TargetHint INode::override_target_hint(TargetHint target_hint) const
diff --git a/src/graph/NodeContext.cpp b/src/graph/NodeContext.cpp
new file mode 100644
index 0000000..2aa5aa1
--- /dev/null
+++ b/src/graph/NodeContext.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/NodeContext.h"
+
+using namespace arm_compute::graph;
+
+void NodeContext::set_target(TargetHint target)
+{
+    _target = target;
+}
+
+void NodeContext::add_input(arm_compute::ITensor *input)
+{
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    _inputs.emplace_back(input);
+}
+
+void NodeContext::add_output(arm_compute::ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+    _outputs.emplace_back(output);
+}
+
+OperationType NodeContext::operation() const
+{
+    return _operation;
+}
+
+TargetHint NodeContext::target() const
+{
+    return _target;
+}
+
+arm_compute::ITensor *NodeContext::input(size_t idx) const
+{
+    ARM_COMPUTE_ERROR_ON(idx >= _inputs.size());
+    return _inputs[idx];
+}
+
+arm_compute::ITensor *NodeContext::output(size_t idx) const
+{
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+    return _outputs[idx];
+}
+
+size_t NodeContext::num_inputs() const
+{
+    return _inputs.size();
+}
+
+size_t NodeContext::num_outputs() const
+{
+    return _outputs.size();
+}
\ No newline at end of file
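
    // NodeContext is the carrier between graph nodes and backend operations. A minimal
    // populate/query sketch using only the API defined above (add_parameter<T> is declared
    // in the header and used by the layer nodes later in this diff); `in_tensor`,
    // `out_tensor` and `act_info` are assumed to exist.
    NodeContext ctx(OperationType::ActivationLayer);
    ctx.set_target(TargetHint::NEON);
    ctx.add_input(in_tensor);   // asserts on nullptr
    ctx.add_output(out_tensor); // asserts on nullptr
    ctx.add_parameter<ActivationLayerInfo>("ActivationLayerInfo", act_info);

    // A backend operation can later read back the same context:
    arm_compute::ITensor *in  = ctx.input(0);
    arm_compute::ITensor *out = ctx.output(0);
    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1 || ctx.num_outputs() != 1);
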
diff --git a/src/graph/OperationRegistry.cpp b/src/graph/OperationRegistry.cpp
new file mode 100644
index 0000000..651653f
--- /dev/null
+++ b/src/graph/OperationRegistry.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/OperationRegistry.h"
+
+using namespace arm_compute::graph;
+
+OperationRegistry::OperationRegistry()
+    : _registered_ops()
+{
+}
+
+OperationRegistry &OperationRegistry::get()
+{
+    static OperationRegistry instance;
+    return instance;
+}
+
+IOperation *OperationRegistry::find_operation(OperationType operation, TargetHint target)
+{
+    ARM_COMPUTE_ERROR_ON(!contains(operation, target));
+    auto it = std::find_if(_registered_ops[operation].begin(), _registered_ops[operation].end(), [&](const std::unique_ptr<IOperation> &op)
+    {
+        return (op->target() == target);
+    });
+    ARM_COMPUTE_ERROR_ON(it == _registered_ops[operation].end());
+    return (*it).get();
+}
+
+bool OperationRegistry::contains(OperationType operation, TargetHint target) const
+{
+    auto it = _registered_ops.find(operation);
+    if(it != _registered_ops.end())
+    {
+        return std::any_of(it->second.begin(), it->second.end(), [&](const std::unique_ptr<IOperation> &op)
+        {
+            return (op->target() == target);
+        });
+    }
+    return false;
+}
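
    // The registry is a per-process singleton; a node resolves its backend implementation at
    // configure time roughly as below (sketch only, reusing the NodeContext `ctx` shown earlier
    // and assuming the operation has been registered for the requested target).
    OperationRegistry &registry = OperationRegistry::get();
    if(registry.contains(OperationType::ActivationLayer, TargetHint::NEON))
    {
        IOperation *op = registry.find_operation(OperationType::ActivationLayer, TargetHint::NEON);
        std::unique_ptr<arm_compute::IFunction> func = op->configure(ctx); // build the runtime function
        func->run();
    }
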
diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
new file mode 100644
index 0000000..e975421
--- /dev/null
+++ b/src/graph/SubGraph.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/SubGraph.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Tensor.h"
+
+using namespace arm_compute::graph;
+
+SubGraph::SubGraph()
+    : _nodes(), _input(nullptr), _output(nullptr)
+{
+}
+
+void SubGraph::add_node(std::unique_ptr<INode> node)
+{
+    _nodes.push_back(std::move(node));
+}
+
+void SubGraph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
+{
+    // If it's the first Tensor added then it will be the input of the sub-graph.
+    if(_input == nullptr)
+    {
+        _input = std::move(tensor);
+    }
+    else
+    {
+        _output = std::move(tensor);
+    }
+}
+
+std::unique_ptr<Graph> SubGraph::construct(TargetHint hint, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output)
+{
+    auto graph = arm_compute::support::cpp14::make_unique<Graph>();
+
+    // Set hint
+    graph->hints().set_target_hint(hint);
+
+    // Configure input
+    if(_input == nullptr)
+    {
+        _input = std::move(input);
+    }
+    graph->add_tensor_object(std::move(_input));
+
+    // Construct nodes
+    for(auto &node : _nodes)
+    {
+        graph->add_node(std::move(node));
+    }
+
+    // Configure output
+    if(_output == nullptr)
+    {
+        _output = std::move(output);
+    }
+    graph->add_tensor_object(std::move(_output));
+
+    return graph;
+}
+
+bool SubGraph::has_input() const
+{
+    return _input != nullptr;
+}
+
+bool SubGraph::has_output() const
+{
+    return _output != nullptr;
+}
+
+SubGraph &arm_compute::graph::operator<<(SubGraph &graph, Tensor &&tensor)
+{
+    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<Tensor>(std::move(tensor)));
+    return graph;
+}
+
+SubGraph &arm_compute::graph::operator<<(SubGraph &graph, SubTensor &&sub_tensor)
+{
+    graph.add_tensor_object(arm_compute::support::cpp14::make_unique<SubTensor>(std::move(sub_tensor)));
+    return graph;
+}
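
    // SubGraph defers target selection until construct() is called. A branch with no explicit
    // input/output borrows both from its caller, as in this sketch (assumes `act_info`,
    // `in_obj` and `out_obj` already exist).
    SubGraph branch;
    branch.add_node(arm_compute::support::cpp14::make_unique<ActivationLayer>(act_info));

    // Nothing was streamed into the branch, so both tensor objects come from the caller:
    std::unique_ptr<Graph> g = branch.construct(TargetHint::OPENCL, std::move(in_obj), std::move(out_obj));
    g->run();
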
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
index abf8506..2edeb3b 100644
--- a/src/graph/SubTensor.cpp
+++ b/src/graph/SubTensor.cpp
@@ -27,7 +27,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/Tensor.h"
 #include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
@@ -35,7 +37,7 @@
 namespace
 {
 template <typename SubTensorType, typename ParentTensorType>
-std::unique_ptr<ITensor> initialise_subtensor(ITensor *parent, TensorShape shape, Coordinates coords)
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords)
 {
     auto ptensor   = dynamic_cast<ParentTensorType *>(parent);
     auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
@@ -44,41 +46,49 @@
 } // namespace
 
 SubTensor::SubTensor()
-    : _target(TargetHint::DONT_CARE), _coords(), _info(), _parent(nullptr), _subtensor(nullptr)
+    : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr)
 {
 }
 
 SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords)
-    : _target(TargetHint::DONT_CARE), _coords(coords), _info(), _parent(nullptr), _subtensor(nullptr)
+    : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr)
 {
     ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
     _parent = parent.tensor();
-    _info   = SubTensorInfo(parent.tensor()->info(), tensor_shape, coords);
     _target = parent.target();
 
     instantiate_subtensor();
 }
 
-SubTensor::SubTensor(ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target)
-    : _target(target), _coords(coords), _info(), _parent(parent), _subtensor(nullptr)
+SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target)
+    : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr)
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
-    _info = SubTensorInfo(parent->info(), tensor_shape, coords);
-
     instantiate_subtensor();
 }
 
-void SubTensor::set_info(SubTensorInfo &&info)
+bool SubTensor::call_accessor()
 {
-    _info = info;
+    return true;
 }
 
-const SubTensorInfo &SubTensor::info() const
+bool SubTensor::has_accessor() const
 {
-    return _info;
+    return false;
 }
 
-ITensor *SubTensor::tensor()
+arm_compute::ITensor *SubTensor::set_target(TargetHint target)
+{
+    ARM_COMPUTE_ERROR_ON(target != _target);
+    return (target == _target) ? _subtensor.get() : nullptr;
+}
+
+arm_compute::ITensor *SubTensor::tensor()
+{
+    return _subtensor.get();
+}
+
+const arm_compute::ITensor *SubTensor::tensor() const
 {
     return _subtensor.get();
 }
@@ -88,15 +98,20 @@
     return _target;
 }
 
+void SubTensor::allocate()
+{
+    // NOP for sub-tensors
+}
+
 void SubTensor::instantiate_subtensor()
 {
     switch(_target)
     {
         case TargetHint::OPENCL:
-            _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _info.tensor_shape(), _coords);
+            _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords);
             break;
         case TargetHint::NEON:
-            _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _info.tensor_shape(), _coords);
+            _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords);
             break;
         default:
             ARM_COMPUTE_ERROR("Invalid TargetHint");
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 31dd4e8..4db79e9 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -35,7 +35,7 @@
 namespace
 {
 template <typename TensorType>
-std::unique_ptr<ITensor> initialise_tensor(TensorInfo &info)
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(TensorInfo &info)
 {
     auto tensor = arm_compute::support::cpp14::make_unique<TensorType>();
     tensor->allocator()->init(info);
@@ -43,7 +43,7 @@
 }
 
 template <typename TensorType>
-void tensor_allocate(ITensor &tensor)
+void tensor_allocate(arm_compute::ITensor &tensor)
 {
     auto itensor = dynamic_cast<TensorType *>(&tensor);
     ARM_COMPUTE_ERROR_ON_NULLPTR(itensor);
@@ -85,7 +85,17 @@
     return retval;
 }
 
-ITensor *Tensor::tensor()
+bool Tensor::has_accessor() const
+{
+    return (_accessor != nullptr);
+}
+
+arm_compute::ITensor *Tensor::tensor()
+{
+    return _tensor.get();
+}
+
+const arm_compute::ITensor *Tensor::tensor() const
 {
     return _tensor.get();
 }
@@ -95,7 +105,7 @@
     return _info;
 }
 
-ITensor *Tensor::set_target(TargetHint target)
+arm_compute::ITensor *Tensor::set_target(TargetHint target)
 {
     if(_tensor != nullptr)
     {
diff --git a/src/graph/nodes/ActivationLayer.cpp b/src/graph/nodes/ActivationLayer.cpp
index 5cd2a0b..54f30ef 100644
--- a/src/graph/nodes/ActivationLayer.cpp
+++ b/src/graph/nodes/ActivationLayer.cpp
@@ -23,73 +23,33 @@
  */
 #include "arm_compute/graph/nodes/ActivationLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
-{
-template <typename ActivationType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
-{
-    auto activation = arm_compute::support::cpp14::make_unique<ActivationType>();
-    activation->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output),
-        activation_info);
-
-    return std::move(activation);
-}
-
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
-{
-    return instantiate_function<arm_compute::CLActivationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, activation_info);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const ActivationLayerInfo &activation_info)
-{
-    return instantiate_function<arm_compute::NEActivationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, activation_info);
-}
-} // namespace
-
 ActivationLayer::ActivationLayer(const ActivationLayerInfo activation_info)
     : _activation_info(activation_info)
 {
 }
 
-std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> ActivationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output, _activation_info);
-        ARM_COMPUTE_LOG("Instantiating CLActivationLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output, _activation_info);
-        ARM_COMPUTE_LOG("Instantiating NEActivationLayer");
-    }
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << " Activation function: " << _activation_info.activation()
-                    << " a: " << _activation_info.a()
-                    << " b: " << _activation_info.b()
-                    << std::endl);
-    return func;
+    // Create node context
+    NodeContext node_ctx(OperationType::ActivationLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<ActivationLayerInfo>("ActivationLayerInfo", _activation_info);
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::ActivationLayer, _target_hint)->configure(node_ctx);
 }
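
    // For context, a NEON operation registered for OperationType::ActivationLayer would consume
    // the context built above roughly as sketched below. The parameter<T>() getter name is an
    // assumption (only add_parameter appears in this diff), and configure_neon_activation is a
    // hypothetical helper, not library code.
    std::unique_ptr<arm_compute::IFunction> configure_neon_activation(NodeContext &ctx)
    {
        arm_compute::ITensor *in  = ctx.input(0);
        arm_compute::ITensor *out = ctx.output(0);
        // Assumption: a parameter<T>(name) getter mirrors add_parameter<T>(name, value).
        const auto info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");

        auto func = arm_compute::support::cpp14::make_unique<arm_compute::NEActivationLayer>();
        func->configure(in, out, info);
        return std::move(func);
    }
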
diff --git a/src/graph/nodes/BatchNormalizationLayer.cpp b/src/graph/nodes/BatchNormalizationLayer.cpp
index a6a990f..7851aa5 100644
--- a/src/graph/nodes/BatchNormalizationLayer.cpp
+++ b/src/graph/nodes/BatchNormalizationLayer.cpp
@@ -23,88 +23,82 @@
  */
 #include "arm_compute/graph/nodes/BatchNormalizationLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
+std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-template <typename BatchBatchNormalizationLayer, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
-{
-    auto norm = arm_compute::support::cpp14::make_unique<BatchBatchNormalizationLayer>();
-    norm->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output),
-        dynamic_cast<TensorType *>(mean.set_target(target_hint)),
-        dynamic_cast<TensorType *>(var.set_target(target_hint)),
-        dynamic_cast<TensorType *>(beta.set_target(target_hint)),
-        dynamic_cast<TensorType *>(gamma.set_target(target_hint)),
-        epsilon);
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    return std::move(norm);
-}
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
-{
-    return instantiate_function<arm_compute::CLBatchNormalizationLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, mean, var, beta, gamma, epsilon);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, Tensor &mean, Tensor &var, Tensor &beta, Tensor &gamma, float epsilon)
-{
-    return instantiate_function<arm_compute::NEBatchNormalizationLayer, arm_compute::ITensor, TargetHint::NEON>(input, output, mean, var, beta, gamma, epsilon);
-}
-} // namespace
-
-std::unique_ptr<arm_compute::IFunction> BatchNormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
-{
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
-
-    unsigned int batch_norm_size = input->info()->dimension(2);
+    unsigned int batch_norm_size = in->info()->dimension(2);
     if(_mean.tensor() == nullptr)
     {
-        _mean.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _mean.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
     if(_var.tensor() == nullptr)
     {
-        _var.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _var.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
     if(_beta.tensor() == nullptr)
     {
-        _beta.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _beta.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
     if(_gamma.tensor() == nullptr)
     {
-        _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _gamma.set_info(TensorInfo(TensorShape(batch_norm_size), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
 
-    if(_target_hint == TargetHint::OPENCL)
+    bool mean_is_loaded  = _mean.tensor() != nullptr;
+    bool var_is_loaded   = _var.tensor() != nullptr;
+    bool gamma_is_loaded = _gamma.tensor() != nullptr;
+    bool beta_is_loaded  = _beta.tensor() != nullptr;
+
+    // Set mean, var, gamma and beta target
+    _mean.set_target(_target_hint);
+    _var.set_target(_target_hint);
+    _gamma.set_target(_target_hint);
+    _beta.set_target(_target_hint);
+
+    // Create node context
+    NodeContext node_ctx(OperationType::BatchNormalizationLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_input(_mean.tensor());
+    node_ctx.add_input(_var.tensor());
+    node_ctx.add_input(_beta.tensor());
+    node_ctx.add_input(_gamma.tensor());
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<float>("epsilon", _epsilon);
+
+    // Configure operation
+    auto func = OperationRegistry::get().find_operation(OperationType::BatchNormalizationLayer, _target_hint)->configure(node_ctx);
+
+    // Fill tensors
+    if(!mean_is_loaded)
     {
-        func = instantiate<TargetHint::OPENCL>(input, output, _mean, _var, _beta, _gamma, _epsilon);
-        ARM_COMPUTE_LOG("Instantiating CLBatchNormalizationLayer");
+        _mean.allocate_and_fill_if_needed();
     }
-    else
+    if(!var_is_loaded)
     {
-        func = instantiate<TargetHint::NEON>(input, output, _mean, _var, _beta, _gamma, _epsilon);
-        ARM_COMPUTE_LOG("Instantiating NEBatchNormalizationLayer");
+        _var.allocate_and_fill_if_needed();
+    }
+    if(!gamma_is_loaded)
+    {
+        _gamma.allocate_and_fill_if_needed();
+    }
+    if(!beta_is_loaded)
+    {
+        _beta.allocate_and_fill_if_needed();
     }
 
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << std::endl);
-
+    // Get function
     return func;
 }
\ No newline at end of file
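
    // The parameter tensors configured above hold one value per channel; the configured function
    // applies the standard batch-normalisation formula, shown here as a scalar reference only
    // (illustrative, not the library kernel).
    #include <cmath>

    // y = gamma * (x - mean) / sqrt(var + epsilon) + beta, broadcast along the channel (z) axis.
    inline float batch_norm_ref(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }
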
diff --git a/src/graph/nodes/BranchLayer.cpp b/src/graph/nodes/BranchLayer.cpp
new file mode 100644
index 0000000..d062e4b
--- /dev/null
+++ b/src/graph/nodes/BranchLayer.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BranchLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "support/ToolchainSupport.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace arm_compute::graph;
+
+namespace
+{
+void depth_concatenate_output_info(ITensorInfo *info, ITensorInfo *sub_tensor_info)
+{
+    ARM_COMPUTE_ERROR_ON(info == nullptr);
+    ARM_COMPUTE_ERROR_ON(sub_tensor_info == nullptr);
+
+    TensorShape        info_shape            = info->tensor_shape();
+    const TensorShape &sub_tensor_info_shape = sub_tensor_info->tensor_shape();
+
+    // Update parent info and valid region
+    if(info_shape.total_size() == 0)
+    {
+        arm_compute::auto_init_if_empty(*info,
+                                        sub_tensor_info->tensor_shape(),
+                                        sub_tensor_info->num_channels(),
+                                        sub_tensor_info->data_type(), sub_tensor_info->fixed_point_position());
+        info->set_valid_region(sub_tensor_info->valid_region());
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(info->num_channels() != sub_tensor_info->num_channels());
+        ARM_COMPUTE_ERROR_ON(info->data_type() != sub_tensor_info->data_type());
+        ARM_COMPUTE_ERROR_ON(info->fixed_point_position() != sub_tensor_info->fixed_point_position());
+
+        // Concatenate depth
+        ARM_COMPUTE_ERROR_ON(info_shape.x() != sub_tensor_info_shape.x());
+        ARM_COMPUTE_ERROR_ON(info_shape.y() != sub_tensor_info_shape.y());
+        info_shape.set(2, info_shape.z() + sub_tensor_info_shape.z());
+        info->set_tensor_shape(info_shape);
+
+        // Update valid region
+        arm_compute::ValidRegion info_valid_region = info->valid_region();
+        info_valid_region.shape.set(2, info_shape.z());
+        arm_compute::ValidRegion updated_region = arm_compute::intersect_valid_regions(info_valid_region, sub_tensor_info->valid_region());
+        info->set_valid_region(updated_region);
+    }
+}
+} // namespace
+
+/** Branch function */
+class BranchFunction final : public arm_compute::IFunction
+{
+public:
+    /** Default Constructor */
+    BranchFunction()
+        : _graphs()
+    {
+    }
+    /** Registers a graph to be executed by the branch function
+     *
+     * @param[in] graph Graph to register
+     */
+    void register_graph(std::unique_ptr<Graph> graph)
+    {
+        _graphs.push_back(std::move(graph));
+    }
+    // Inherited methods overridden:
+    void run() override
+    {
+        for(auto &g : _graphs)
+        {
+            ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
+            g->run();
+        }
+    }
+
+private:
+    std::vector<std::unique_ptr<Graph>> _graphs;
+};
+
+std::unique_ptr<arm_compute::IFunction> BranchLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON(_branch_merge_method != BranchMergeMethod::DEPTH_CONCATENATE);
+    ARM_COMPUTE_UNUSED(_branch_merge_method);
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    // Create branch function
+    auto func = arm_compute::support::cpp14::make_unique<BranchFunction>();
+
+    // Track the merged output TensorInfo and depth
+    TensorInfo out_info;
+    int        depth = 0;
+
+    // Construct all sub-graphs given the input/output
+    for(auto &sg : _sub_graphs)
+    {
+        ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
+
+        // IO buffers
+        std::unique_ptr<ITensorObject> in;
+        std::unique_ptr<ITensorObject> out;
+        SubTensor                     *out_sub_tensor = nullptr;
+
+        // Create input sub-tensor
+        if(!sg->has_input())
+        {
+            ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
+            in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
+                                                                     input->tensor()->info()->tensor_shape(),
+                                                                     Coordinates());
+        }
+
+        // Create output sub-tensor
+        if(!sg->has_output())
+        {
+            ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
+            out = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(output),
+                                                                      output->tensor()->info()->tensor_shape(),
+                                                                      Coordinates(0, 0, depth));
+            out_sub_tensor = dynamic_cast<SubTensor *>(out.get());
+        }
+
+        // Construct sub_graph
+        auto g = sg->construct(ctx.hints().target_hint(), std::move(in), std::move(out));
+
+        // Register graph to function
+        func->register_graph(std::move(g));
+
+        // Update and track depth
+        if(out_sub_tensor != nullptr)
+        {
+            ARM_COMPUTE_ERROR_ON(out_sub_tensor->tensor() == nullptr);
+            depth += out_sub_tensor->tensor()->info()->tensor_shape()[2];
+            depth_concatenate_output_info(&out_info, out_sub_tensor->tensor()->info());
+        }
+    }
+
+    // Auto-init output
+    arm_compute::auto_init_if_empty(*output->tensor()->info(),
+                                    out_info.tensor_shape(),
+                                    out_info.num_channels(),
+                                    out_info.data_type(),
+                                    out_info.fixed_point_position());
+
+    return std::move(func);
+}
\ No newline at end of file
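
    // Worked example of the bookkeeping in depth_concatenate_output_info(): two branches
    // producing (28, 28, 64) and (28, 28, 32) must agree on x/y, the z extents accumulate,
    // and the second branch writes through a SubTensor placed at Coordinates(0, 0, 64).
    // Shape arithmetic only, values illustrative.
    TensorShape a(28U, 28U, 64U);
    TensorShape b(28U, 28U, 32U);
    TensorShape merged = a;
    merged.set(2, a.z() + b.z()); // -> (28, 28, 96), mirroring the z accumulation above
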
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index b47be8d..ae4a8d7 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/graph/nodes/ConvolutionLayer.h"
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/graph/Error.h"
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -67,7 +67,8 @@
 
 // Instantiate GEMM based convolution layer
 template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
+                                                             const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
     auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
     conv->configure(
@@ -81,7 +82,8 @@
 
 // Instantiate direct convolution layer
 template <typename ConvolutionType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+std::unique_ptr<arm_compute::IFunction> instantiate_direct_function(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
+                                                                    const PadStrideInfo &conv_info)
 {
     auto conv = arm_compute::support::cpp14::make_unique<ConvolutionType>();
     conv->configure(
@@ -94,11 +96,13 @@
 }
 
 template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
+                                                    const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
                                                     ConvolutionMethodHint conv_method);
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
+                                                                        const PadStrideInfo &conv_info,
                                                                         const WeightsInfo    &weights_info,
                                                                         ConvolutionMethodHint conv_method)
 {
@@ -113,7 +117,8 @@
 }
 
 template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *weights, ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(arm_compute::ITensor *input, arm_compute::ITensor *weights, arm_compute::ITensor *biases, arm_compute::ITensor *output,
+                                                                      const PadStrideInfo &conv_info,
                                                                       const WeightsInfo    &weights_info,
                                                                       ConvolutionMethodHint conv_method)
 {
@@ -169,18 +174,24 @@
     std::vector<std::unique_ptr<IFunction>> _convolutions;
 };
 
-std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> ConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
     // Set weights and biases info
     if(_weights.tensor() == nullptr)
     {
-        _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, input->info()->dimension(2) / _num_groups, _ofm),
-                                     input->info()->num_channels(), input->info()->data_type(),
-                                     input->info()->fixed_point_position()));
+        _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, in->info()->dimension(2) / _num_groups, _ofm),
+                                     in->info()->num_channels(),
+                                     in->info()->data_type(),
+                                     in->info()->fixed_point_position()));
     }
-    if(_biases.tensor() == nullptr)
+    if(_biases.has_accessor() && _biases.tensor() == nullptr)
     {
-        _biases.set_info(TensorInfo(TensorShape(_ofm), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _biases.set_info(TensorInfo(TensorShape(_ofm), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
 
     std::unique_ptr<arm_compute::IFunction> func;
@@ -189,28 +200,29 @@
 
     // Check if the weights and biases are loaded
     bool weights_are_loaded = _weights.tensor() != nullptr;
-    bool biases_are_loaded  = _weights.tensor() != nullptr;
+    bool biases_are_loaded  = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
 
     // Set bias and weights target
     _weights.set_target(_target_hint);
-    _biases.set_target(_target_hint);
+    if(_biases.has_accessor())
+    {
+        _biases.set_target(_target_hint);
+    }
 
     // Calculate output shape
-    TensorShape output_shape = calculate_convolution_layer_output_shape(input->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
+    TensorShape output_shape = calculate_convolution_layer_output_shape(in->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
 
     // Output auto initialization if not yet initialized
-    arm_compute::auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    arm_compute::auto_init_if_empty(*out->info(), output_shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
 
     // Create appropriate convolution function
     if(_num_groups == 1)
     {
-        func = instantiate_convolution(input, output, conv_method_hint);
-        ARM_COMPUTE_LOG("Instantiating CLConvolutionLayer");
+        func = instantiate_convolution(in, out, conv_method_hint);
     }
     else
     {
-        func = instantiate_grouped_convolution(input, output, conv_method_hint);
-        ARM_COMPUTE_LOG("Instantiating NEConvolutionLayer");
+        func = instantiate_grouped_convolution(in, out, conv_method_hint);
     }
 
     // Fill weights
@@ -224,15 +236,15 @@
         _biases.allocate_and_fill_if_needed();
     }
 
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input Shape: " << input->info()->tensor_shape()
-                    << " Weights shape: " << _weights.info().tensor_shape()
-                    << " Biases Shape: " << _biases.info().tensor_shape()
-                    << " Output Shape: " << output->info()->tensor_shape()
-                    << " PadStrideInfo: " << _conv_info
-                    << " Groups: " << _num_groups
-                    << " WeightsInfo: " << _weights_info
-                    << std::endl);
+    ARM_COMPUTE_LOG_GRAPH_INFO(" Data Type: " << in->info()->data_type()
+                               << " Input Shape: " << in->info()->tensor_shape()
+                               << " Weights shape: " << _weights.info().tensor_shape()
+                               << " Biases Shape: " << _biases.info().tensor_shape()
+                               << " Output Shape: " << out->info()->tensor_shape()
+                               << " PadStrideInfo: " << _conv_info
+                               << " Groups: " << _num_groups
+                               << " WeightsInfo: " << _weights_info
+                               << std::endl);
 
     return func;
 }
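
    // The output shape fed to auto_init_if_empty() above comes from
    // calculate_convolution_layer_output_shape(); per spatial dimension it follows the usual
    // convolution extent relation, sketched here with floor rounding assumed (the library may
    // additionally apply its own rounding policy, not shown).
    static unsigned int conv_out_dim(unsigned int in, unsigned int kernel, unsigned int pad_total, unsigned int stride)
    {
        return (in + pad_total - kernel) / stride + 1;
    }
    // e.g. conv_out_dim(224, 7, 6, 2) == 112 for a 7x7/stride-2 convolution with 3-pixel padding per side.
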
@@ -242,10 +254,12 @@
     std::unique_ptr<arm_compute::IFunction> func;
     if(_target_hint == TargetHint::OPENCL)
     {
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
         func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
     else
     {
+        ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
         func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
     }
     return func;
@@ -307,10 +321,12 @@
         // Instantiate convolution function
         if(_target_hint == TargetHint::OPENCL)
         {
+            ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
             func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
         }
         else
         {
+            ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
             func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
         }
 
diff --git a/src/graph/nodes/DeQuantizationLayer.cpp b/src/graph/nodes/DeQuantizationLayer.cpp
new file mode 100644
index 0000000..af9ecee
--- /dev/null
+++ b/src/graph/nodes/DeQuantizationLayer.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DequantizationLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+
+using namespace arm_compute::graph;
+
+std::unique_ptr<arm_compute::IFunction> DequantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    _target_hint              = ctx.hints().target_hint();
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
+    if(_min_max.tensor() == nullptr)
+    {
+        TensorShape shape = in->info()->tensor_shape();
+        shape.set(Window::DimX, 2);
+        shape.remove_dimension(1);
+        shape.remove_dimension(1);
+
+        _min_max.set_info(TensorInfo(shape, in->info()->num_channels(), DataType::F32));
+        _min_max.set_target(_target_hint);
+    }
+
+    bool minmax_is_loaded = _min_max.tensor() != nullptr;
+
+    // Create node context
+    NodeContext node_ctx(OperationType::DequantizationLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(_min_max.tensor());
+    node_ctx.add_output(out);
+
+    // Fill min max
+    if(!minmax_is_loaded)
+    {
+        _min_max.allocate_and_fill_if_needed();
+    }
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::DequantizationLayer, _target_hint)->configure(node_ctx);
+}
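
    // Worked example of the min/max shape derivation above: for a 4D input (W, H, C, N) the X
    // extent is forced to 2 (one min and one max) and the two middle dimensions are removed,
    // leaving a {min, max} pair per batch item. Illustrative values only.
    TensorShape shape(16U, 16U, 8U, 4U); // (W, H, C, N)
    shape.set(Window::DimX, 2);          // (2, 16, 8, 4)
    shape.remove_dimension(1);           // (2, 8, 4)
    shape.remove_dimension(1);           // (2, 4) -> one {min, max} pair per batch item
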
diff --git a/src/graph/nodes/DepthConvertLayer.cpp b/src/graph/nodes/DepthConvertLayer.cpp
new file mode 100644
index 0000000..9b328e7
--- /dev/null
+++ b/src/graph/nodes/DepthConvertLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthConvertLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+
+using namespace arm_compute::graph;
+
+DepthConvertLayer::DepthConvertLayer(const ConvertPolicy policy, uint32_t shift, DataType output_datatype)
+    : _policy(policy), _shift(shift), _output_datatype(output_datatype)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> DepthConvertLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    _target_hint              = ctx.hints().target_hint();
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
+    // Auto configure output
+    arm_compute::auto_init_if_empty(*out->info(), in->info()->tensor_shape(), 1, _output_datatype, in->info()->fixed_point_position());
+
+    // Create node context
+    NodeContext node_ctx(OperationType::DepthConvertLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<ConvertPolicy>("ConvertPolicy", _policy);
+    node_ctx.add_parameter<uint32_t>("shift", _shift);
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::DepthConvertLayer, _target_hint)->configure(node_ctx);
+}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayer.cpp b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000..b459853
--- /dev/null
+++ b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DepthwiseConvolutionLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::graph;
+
+std::unique_ptr<arm_compute::IFunction> DepthwiseConvolutionLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
+
+    if(_weights.tensor() == nullptr)
+    {
+        TensorShape shape = in->info()->tensor_shape();
+        shape.set(Window::DimX, _conv_width);
+        shape.set(Window::DimY, _conv_height);
+        _weights.set_info(TensorInfo(TensorShape(shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+    }
+    if(_biases.has_accessor() && _biases.tensor() == nullptr)
+    {
+        _biases.set_info(TensorInfo(TensorShape(in->info()->dimension(2)), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+    }
+
+    bool weights_is_loaded = _weights.tensor() != nullptr;
+    bool biases_is_loaded  = _biases.has_accessor() ? _biases.tensor() != nullptr : true;
+
+    _weights.set_target(_target_hint);
+    if(_biases.has_accessor())
+    {
+        _biases.set_target(_target_hint);
+    }
+
+    // Create node context
+    NodeContext node_ctx(OperationType::DepthwiseConvolutionLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_input(_weights.tensor());
+    if(_biases.has_accessor())
+    {
+        node_ctx.add_input(_biases.tensor());
+    }
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<PadStrideInfo>("ConvolutionInfo", _conv_info);
+    node_ctx.add_parameter<bool>("Optimized3x3", _opt3x3);
+
+    // Configure operation
+    auto func = OperationRegistry::get().find_operation(OperationType::DepthwiseConvolutionLayer, _target_hint)->configure(node_ctx);
+
+    // Fill tensors
+    if(!weights_is_loaded)
+    {
+        _weights.allocate_and_fill_if_needed();
+    }
+    if(!biases_is_loaded)
+    {
+        _biases.allocate_and_fill_if_needed();
+    }
+
+    // Get function
+    return func;
+}
diff --git a/src/graph/nodes/FlattenLayer.cpp b/src/graph/nodes/FlattenLayer.cpp
new file mode 100644
index 0000000..ea08296
--- /dev/null
+++ b/src/graph/nodes/FlattenLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/FlattenLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::graph;
+
+std::unique_ptr<arm_compute::IFunction> FlattenLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    _target_hint              = ctx.hints().target_hint();
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
+    // Auto configure output
+    TensorShape tensor_shape = in->info()->tensor_shape();
+    tensor_shape.collapse(in->info()->num_dimensions());
+    arm_compute::auto_init_if_empty(*out->info(), tensor_shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
+
+    // Create node context
+    NodeContext node_ctx(OperationType::FlattenLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::FlattenLayer, _target_hint)->configure(node_ctx);
+}
\ No newline at end of file
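
    // The auto-configuration above collapses every input dimension into one. For example
    // (illustrative shape only):
    TensorShape s(8U, 8U, 16U);
    s.collapse(3); // all three dimensions folded together -> (1024), matching the flattened output
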
diff --git a/src/graph/nodes/FloorLayer.cpp b/src/graph/nodes/FloorLayer.cpp
index 722cfdf..8750546 100644
--- a/src/graph/nodes/FloorLayer.cpp
+++ b/src/graph/nodes/FloorLayer.cpp
@@ -23,65 +23,27 @@
  */
 #include "arm_compute/graph/nodes/FloorLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLFloor.h"
-#include "arm_compute/runtime/NEON/functions/NEFloor.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
+std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-template <typename FloorType, typename TensorType, TargetHint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
-{
-    auto floorlayer = arm_compute::support::cpp14::make_unique<FloorType>();
-    floorlayer->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output));
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    return std::move(floorlayer);
-}
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+    // Create node context
+    NodeContext node_ctx(OperationType::FloorLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
 
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
-{
-    return instantiate_function<arm_compute::CLFloor, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
-{
-    return instantiate_function<arm_compute::NEFloor, arm_compute::ITensor, TargetHint::NEON>(input, output);
-}
-} // namespace
-
-std::unique_ptr<arm_compute::IFunction> FloorLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
-{
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
-
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output);
-        ARM_COMPUTE_LOG("Instantiating CLFloorLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output);
-        ARM_COMPUTE_LOG("Instantiating NEFloorLayer");
-    }
-
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << std::endl);
-
-    return func;
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::FloorLayer, _target_hint)->configure(node_ctx);
 }
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 6b21810..219e0f9 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -23,12 +23,10 @@
  */
 #include "arm_compute/graph/nodes/FullyConnectedLayer.h"
 
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
@@ -44,52 +42,20 @@
     }
     return TensorShape(output_neurons, batches);
 }
-template <typename FullyConnectedType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
-{
-    bool weights_are_loaded = weights.tensor() != nullptr;
-    bool biases_are_loaded  = biases.tensor() != nullptr;
-
-    auto conv = arm_compute::support::cpp14::make_unique<FullyConnectedType>();
-    conv->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(weights.set_target(target_hint)),
-        dynamic_cast<TensorType *>(biases.set_target(target_hint)),
-        dynamic_cast<TensorType *>(output));
-    if(!weights_are_loaded)
-    {
-        weights.allocate_and_fill_if_needed();
-    }
-    if(!biases_are_loaded)
-    {
-        biases.allocate_and_fill_if_needed();
-    }
-
-    return std::move(conv);
-}
-
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
-{
-    return instantiate_function<arm_compute::CLFullyConnectedLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, weights, biases, output);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, Tensor &weights, Tensor &biases, ITensor *output)
-{
-    return instantiate_function<arm_compute::NEFullyConnectedLayer, arm_compute::Tensor, TargetHint::NEON>(input, weights, biases, output);
-}
 } // namespace
 
-std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> FullyConnectedLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
+
     if(_weights.tensor() == nullptr)
     {
         unsigned int num_weights    = 1;
-        unsigned int num_dimensions = input->info()->num_dimensions();
+        unsigned int num_dimensions = in->info()->num_dimensions();
         // Ignore the batch dimension if there is one:
         if(num_dimensions == 2 || num_dimensions == 4)
         {
@@ -97,40 +63,44 @@
         }
         for(unsigned int i = 0; i < num_dimensions; i++)
         {
-            num_weights *= input->info()->dimension(i);
+            num_weights *= in->info()->dimension(i);
         }
-        _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _weights.set_info(TensorInfo(TensorShape(num_weights, _num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
     if(_biases.tensor() == nullptr)
     {
-        _biases.set_info(TensorInfo(TensorShape(_num_neurons), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+        _biases.set_info(TensorInfo(TensorShape(_num_neurons), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
     }
 
     // Auto configure output
-    arm_compute::auto_init_if_empty(*output->info(),
-                                    calculate_fullyconnected_layer_output_shape(input->info()->tensor_shape(), _num_neurons),
-                                    input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+    arm_compute::auto_init_if_empty(*out->info(),
+                                    calculate_fullyconnected_layer_output_shape(in->info()->tensor_shape(), _num_neurons),
+                                    in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
 
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
+    bool weights_are_loaded = _weights.tensor() != nullptr;
+    bool biases_are_loaded  = _biases.tensor() != nullptr;
 
-    if(_target_hint == TargetHint::OPENCL)
+    // Create node context
+    NodeContext node_ctx(OperationType::FullyConnectedLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_input(_weights.set_target(_target_hint));
+    node_ctx.add_input(_biases.set_target(_target_hint));
+    node_ctx.add_output(out);
+
+    // Configure operation
+    auto func = OperationRegistry::get().find_operation(OperationType::FullyConnectedLayer, _target_hint)->configure(node_ctx);
+
+    // Fill weights and biases if they were not already loaded
+    if(!weights_are_loaded)
     {
-        func = instantiate<TargetHint::OPENCL>(input, _weights, _biases, output);
-        ARM_COMPUTE_LOG("Instantiating CLFullyConnectedLayer");
+        _weights.allocate_and_fill_if_needed();
     }
-    else
+    if(!biases_are_loaded)
     {
-        func = instantiate<TargetHint::NEON>(input, _weights, _biases, output);
-        ARM_COMPUTE_LOG("Instantiating NEFullyConnectedLayer");
+        _biases.allocate_and_fill_if_needed();
     }
 
-    ARM_COMPUTE_LOG(" Type: " << input->info()->data_type()
-                    << " Input Shape: " << input->info()->tensor_shape()
-                    << " Weights shape: " << _weights.info().tensor_shape()
-                    << " Biases Shape: " << _biases.info().tensor_shape()
-                    << " Output Shape: " << output->info()->tensor_shape()
-                    << std::endl);
-
+    // Return the configured function
     return func;
 }
diff --git a/src/graph/nodes/L2NormalizeLayer.cpp b/src/graph/nodes/L2NormalizeLayer.cpp
index 46d1552..9813ba4 100644
--- a/src/graph/nodes/L2NormalizeLayer.cpp
+++ b/src/graph/nodes/L2NormalizeLayer.cpp
@@ -23,67 +23,34 @@
  */
 #include "arm_compute/graph/nodes/L2NormalizeLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
+L2NormalizeLayer::L2NormalizeLayer(unsigned int axis, float epsilon)
+    : _axis(axis), _epsilon(epsilon)
 {
-template <typename L2NormalizeType, typename TensorType, TargetHint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
-{
-    auto l2norm = arm_compute::support::cpp14::make_unique<L2NormalizeType>();
-    l2norm->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output),
-        axis,
-        epsilon);
-
-    return std::move(l2norm);
 }
 
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, unsigned int axis, float epsilon);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-    return instantiate_function<arm_compute::CLL2Normalize, arm_compute::ICLTensor, TargetHint::OPENCL>(input, output, axis, epsilon);
-}
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
-{
-    return instantiate_function<arm_compute::NEL2Normalize, arm_compute::ITensor, TargetHint::NEON>(input, output, axis, epsilon);
-}
-} // namespace
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-std::unique_ptr<arm_compute::IFunction> L2NormalizeLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
-{
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
+    // Create node context
+    NodeContext node_ctx(OperationType::L2NormalizeLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<unsigned int>("axis", _axis);
+    node_ctx.add_parameter<float>("epsilon", _epsilon);
 
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output, _axis, _epsilon);
-        ARM_COMPUTE_LOG("Instantiating CLL2NormalizeLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output, _axis, _epsilon);
-        ARM_COMPUTE_LOG("Instantiating NEL2NormalizeLayer");
-    }
-
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << std::endl);
-
-    return func;
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::L2NormalizeLayer, _target_hint)->configure(node_ctx);
 }
diff --git a/src/graph/nodes/NormalizationLayer.cpp b/src/graph/nodes/NormalizationLayer.cpp
index 47f0891..a489329 100644
--- a/src/graph/nodes/NormalizationLayer.cpp
+++ b/src/graph/nodes/NormalizationLayer.cpp
@@ -23,72 +23,33 @@
  */
 #include "arm_compute/graph/nodes/NormalizationLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
-{
-template <typename NormalizationType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
-{
-    auto norm = arm_compute::support::cpp14::make_unique<NormalizationType>();
-    norm->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output),
-        norm_info);
-
-    return std::move(norm);
-}
-
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
-{
-    return instantiate_function<arm_compute::CLNormalizationLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, norm_info);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
-{
-    return instantiate_function<arm_compute::NENormalizationLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, norm_info);
-}
-} // namespace
-
 NormalizationLayer::NormalizationLayer(const NormalizationLayerInfo norm_info)
     : _norm_info(norm_info)
 {
 }
 
-std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> NormalizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output, _norm_info);
-        ARM_COMPUTE_LOG("Instantiating CLNormalizationLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output, _norm_info);
-        ARM_COMPUTE_LOG("Instantiating NENormalizationLayer");
-    }
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << " Normalization info: " << _norm_info
-                    << std::endl);
+    // Create node context
+    NodeContext node_ctx(OperationType::NormalizationLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<NormalizationLayerInfo>("NormalizationLayerInfo", _norm_info);
 
-    return func;
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::NormalizationLayer, _target_hint)->configure(node_ctx);
 }
diff --git a/src/graph/nodes/PoolingLayer.cpp b/src/graph/nodes/PoolingLayer.cpp
index 317cf4d..2c15119 100644
--- a/src/graph/nodes/PoolingLayer.cpp
+++ b/src/graph/nodes/PoolingLayer.cpp
@@ -23,71 +23,33 @@
  */
 #include "arm_compute/graph/nodes/PoolingLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
-{
-template <typename PoolingType, typename TensorType, TargetHint target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
-{
-    auto pool = arm_compute::support::cpp14::make_unique<PoolingType>();
-    pool->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output),
-        pool_info);
-
-    return std::move(pool);
-}
-
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
-{
-    return instantiate_function<arm_compute::CLPoolingLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output, pool_info);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info)
-{
-    return instantiate_function<arm_compute::NEPoolingLayer, arm_compute::Tensor, TargetHint::NEON>(input, output, pool_info);
-}
-} // namespace
-
 PoolingLayer::PoolingLayer(const PoolingLayerInfo pool_info)
     : _pool_info(pool_info)
 {
 }
 
-std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
+std::unique_ptr<arm_compute::IFunction> PoolingLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output, _pool_info);
-        ARM_COMPUTE_LOG("Instantiating CLPoolingLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output, _pool_info);
-        ARM_COMPUTE_LOG("Instantiating NEPoolingLayer");
-    }
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << " Pooling info: " << _pool_info << std::endl);
+    // Create node context
+    NodeContext node_ctx(OperationType::PoolingLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+    node_ctx.add_parameter<PoolingLayerInfo>("PoolingLayerInfo", _pool_info);
 
-    return func;
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::PoolingLayer, _target_hint)->configure(node_ctx);
 }
diff --git a/src/graph/nodes/QuantizationLayer.cpp b/src/graph/nodes/QuantizationLayer.cpp
new file mode 100644
index 0000000..c102f47
--- /dev/null
+++ b/src/graph/nodes/QuantizationLayer.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/QuantizationLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+
+using namespace arm_compute::graph;
+
+std::unique_ptr<arm_compute::IFunction> QuantizationLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    _target_hint              = ctx.hints().target_hint();
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
+    // Create node context
+    NodeContext node_ctx(OperationType::QuantizationLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::QuantizationLayer, _target_hint)->configure(node_ctx);
+}
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
new file mode 100644
index 0000000..bbe0739
--- /dev/null
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ReshapeLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::graph;
+
+ReshapeLayer::ReshapeLayer(TensorShape shape)
+    : _shape(shape)
+{
+}
+
+std::unique_ptr<arm_compute::IFunction> ReshapeLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+
+    _target_hint              = ctx.hints().target_hint();
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+
+    // Auto configure output
+    arm_compute::auto_init_if_empty(*out->info(), _shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
+
+    // Create node context
+    NodeContext node_ctx(OperationType::ReshapeLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
+
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::ReshapeLayer, _target_hint)->configure(node_ctx);
+}
diff --git a/src/graph/nodes/SoftmaxLayer.cpp b/src/graph/nodes/SoftmaxLayer.cpp
index 8628244..7f2325b 100644
--- a/src/graph/nodes/SoftmaxLayer.cpp
+++ b/src/graph/nodes/SoftmaxLayer.cpp
@@ -23,65 +23,27 @@
  */
 #include "arm_compute/graph/nodes/SoftmaxLayer.h"
 
-#include "arm_compute/core/Logger.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
-#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
 #include "support/ToolchainSupport.h"
-#include "utils/TypePrinter.h"
 
 using namespace arm_compute::graph;
 
-namespace
+std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
 {
-template <typename SoftmaxType, typename TensorType, TargetHint hint>
-std::unique_ptr<arm_compute::IFunction> instantiate_function(ITensor *input, ITensor *output)
-{
-    auto softmax = arm_compute::support::cpp14::make_unique<SoftmaxType>();
-    softmax->configure(
-        dynamic_cast<TensorType *>(input),
-        dynamic_cast<TensorType *>(output));
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
 
-    return std::move(softmax);
-}
+    arm_compute::ITensor *in  = input->tensor();
+    arm_compute::ITensor *out = output->tensor();
+    _target_hint              = ctx.hints().target_hint();
 
-template <TargetHint                    target_hint>
-std::unique_ptr<arm_compute::IFunction> instantiate(ITensor *input, ITensor *output);
+    // Create node context
+    NodeContext node_ctx(OperationType::SoftmaxLayer);
+    node_ctx.set_target(_target_hint);
+    node_ctx.add_input(in);
+    node_ctx.add_output(out);
 
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::OPENCL>(ITensor *input, ITensor *output)
-{
-    return instantiate_function<arm_compute::CLSoftmaxLayer, arm_compute::CLTensor, TargetHint::OPENCL>(input, output);
-}
-
-template <>
-std::unique_ptr<arm_compute::IFunction> instantiate<TargetHint::NEON>(ITensor *input, ITensor *output)
-{
-    return instantiate_function<arm_compute::NESoftmaxLayer, arm_compute::Tensor, TargetHint::NEON>(input, output);
-}
-} // namespace
-
-std::unique_ptr<arm_compute::IFunction> SoftmaxLayer::instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output)
-{
-    std::unique_ptr<arm_compute::IFunction> func;
-    _target_hint = ctx.hints().target_hint();
-
-    if(_target_hint == TargetHint::OPENCL)
-    {
-        func = instantiate<TargetHint::OPENCL>(input, output);
-        ARM_COMPUTE_LOG("Instantiating CLSoftmaxLayer");
-    }
-    else
-    {
-        func = instantiate<TargetHint::NEON>(input, output);
-        ARM_COMPUTE_LOG("Instantiating NESoftmaxLayer");
-    }
-
-    ARM_COMPUTE_LOG(" Data Type: " << input->info()->data_type()
-                    << " Input shape: " << input->info()->tensor_shape()
-                    << " Output shape: " << output->info()->tensor_shape()
-                    << std::endl);
-
-    return func;
+    // Get function
+    return OperationRegistry::get().find_operation(OperationType::SoftmaxLayer, _target_hint)->configure(node_ctx);
 }
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
new file mode 100644
index 0000000..8f2bf23
--- /dev/null
+++ b/src/graph/operations/CLSimpleOperations.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/graph/IOperation.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistrar.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/runtime/CL/CLFunctions.h"
+#include "support/ToolchainSupport.h"
+#include "utils/GraphTypePrinter.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+
+using namespace arm_compute::graph;
+
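+// Each REGISTER_SIMPLE_OPERATION block below provides the body of an OpenCL operation,
+// presumably registered with the OperationRegistry through a static OperationRegistrar
+// expanded by the macro. The body receives a NodeContext `ctx` carrying the inputs,
+// outputs and named parameters added by the graph nodes, and returns the configured
+// arm_compute::IFunction that the nodes obtain via
+// OperationRegistry::get().find_operation(<OperationType>, <target>)->configure(node_ctx).
+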
+/* Activation Layer */
+REGISTER_SIMPLE_OPERATION(CLActivationLayerOperation, OPENCL, OperationType::ActivationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in       = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *out      = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
+
+    // Create and configure function
+    auto activation = arm_compute::support::cpp14::make_unique<arm_compute::CLActivationLayer>();
+    activation->configure(in, out, act_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLActivationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
+                               << std::endl);
+
+    return std::move(activation);
+}
+
+/* Batch Normalization Layer */
+REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationType::BatchNormalizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *mean    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto      *var     = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
+    auto      *beta    = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(3));
+    auto      *gamma   = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(4));
+    auto      *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto epsilon = ctx.parameter<float>("epsilon");
+
+    // Create and configure function
+    auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLBatchNormalizationLayer>();
+    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLBatchNormalizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Mean shape: " << mean->info()->tensor_shape()
+                               << " Var shape: " << var->info()->tensor_shape()
+                               << " Beta shape: " << beta->info()->tensor_shape()
+                               << " Gamma shape: " << gamma->info()->tensor_shape()
+                               << " Epsilon: " << epsilon
+                               << std::endl);
+
+    return std::move(batch_norm);
+}
+
+/* DepthConvert Layer */
+REGISTER_SIMPLE_OPERATION(CLDepthConvertLayerOperation, OPENCL, OperationType::DepthConvertLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in          = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *out         = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
+    const auto shift       = ctx.parameter<uint32_t>("shift");
+
+    // Create and configure function
+    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthConvertLayer>();
+    depthconvert->configure(in, out, conv_policy, shift);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthConvertLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " shift: " << shift
+                               << std::endl);
+
+    return std::move(depthconvert);
+}
+
+/* DepthwiseConvolution Layer */
+REGISTER_SIMPLE_OPERATION(CLDepthwiseConvolutionOperation, OPENCL, OperationType::DepthwiseConvolutionLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *weights   = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto      *biases    = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) : nullptr;
+    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
+    const auto opt3x3    = ctx.parameter<bool>("Optimized3x3");
+
+    // Create and configure function
+    std::unique_ptr<arm_compute::IFunction> func;
+    bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
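+    // The 3x3-optimized path is taken only when it is both requested (Optimized3x3) and
+    // the kernel width reported by the weights is actually 3.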
+    if(run_3x3_opt)
+    {
+        auto depthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
+        depthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depthwise_conv);
+    }
+    else
+    {
+        auto depthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
+        depthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depthwise_conv);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDepthwiseConvolutionLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape());
+    if(biases == nullptr)
+    {
+        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
+    }
+    else
+    {
+        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
+    }
+
+    return func;
+}
+
+/* Dequantization Layer */
+REGISTER_SIMPLE_OPERATION(CLDequantizationLayerOperation, OPENCL, OperationType::DequantizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1)) == nullptr);
+
+    // Extract IO and info
+    auto *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    auto *min_max = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(1));
+
+    // Create and configure function
+    auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::CLDequantizationLayer>();
+    dequantization->configure(in, out, min_max);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDequantizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Min max shape: " << min_max->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(dequantization);
+}
+
+/* Flatten Layer */
+REGISTER_SIMPLE_OPERATION(CLFlattenLayerOperation, OPENCL, OperationType::FlattenLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::CLFlattenLayer>();
+    flatten->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFlattenLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(flatten);
+}
+
+/* Floor Layer */
+REGISTER_SIMPLE_OPERATION(CLFloorLayerOperation, OPENCL, OperationType::FloorLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto floor = arm_compute::support::cpp14::make_unique<arm_compute::CLFloor>();
+    floor->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFloorLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(floor);
+}
+
+/* Fully Connected Layer */
+REGISTER_SIMPLE_OPERATION(CLFullyConnectedLayerOperation, OPENCL, OperationType::FullyConnectedLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *weights = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto *biases  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(2));
+    auto *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto fc = arm_compute::support::cpp14::make_unique<arm_compute::CLFullyConnectedLayer>();
+    fc->configure(in, weights, biases, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLFullyConnectedLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Biases Shape: " << biases->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(fc);
+}
+
+/* L2 Normalize Layer */
+REGISTER_SIMPLE_OPERATION(CLL2NormalizeLayerOperation, OPENCL, OperationType::L2NormalizeLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in      = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *out     = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto axis    = ctx.parameter<unsigned int>("axis");
+    const auto epsilon = ctx.parameter<float>("epsilon");
+
+    // Create and configure function
+    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::CLL2NormalizeLayer>();
+    l2_norm->configure(in, out, axis, epsilon);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLL2NormalizeLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Axis: " << axis
+                               << " Epsilon: " << epsilon
+                               << std::endl);
+
+    return std::move(l2_norm);
+}
+
+/* Normalization Layer */
+REGISTER_SIMPLE_OPERATION(CLNormalizationLayerOperation, OPENCL, OperationType::NormalizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
+
+    // Create and configure function
+    auto norm = arm_compute::support::cpp14::make_unique<arm_compute::CLNormalizationLayer>();
+    norm->configure(in, out, norm_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLNormalizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Normalization info: " << norm_info
+                               << std::endl);
+
+    return std::move(norm);
+}
+
+/* Pooling Layer */
+REGISTER_SIMPLE_OPERATION(CLPoolingLayerOperation, OPENCL, OperationType::PoolingLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto      *out       = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+    const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
+
+    // Create and configure function
+    auto pool = arm_compute::support::cpp14::make_unique<arm_compute::CLPoolingLayer>();
+    pool->configure(in, out, pool_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLPoolingLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Pooling info: " << pool_info
+                               << std::endl);
+
+    return std::move(pool);
+}
+
+/* Quantization Layer */
+REGISTER_SIMPLE_OPERATION(CLQuantizationLayerOperation, OPENCL, OperationType::QuantizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::CLQuantizationLayer>();
+    quantization->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLQuantizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(quantization);
+}
+
+/* Reshape Layer */
+REGISTER_SIMPLE_OPERATION(CLReshapeLayerOperation, OPENCL, OperationType::ReshapeLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::CLReshapeLayer>();
+    reshape->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLReshapeLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(reshape);
+}
+
+/* Softmax Layer */
+REGISTER_SIMPLE_OPERATION(CLSoftmaxLayerOperation, OPENCL, OperationType::SoftmaxLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto smx = arm_compute::support::cpp14::make_unique<arm_compute::CLSoftmaxLayer>();
+    smx->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLSoftmaxLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(smx);
+}
\ No newline at end of file
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
new file mode 100644
index 0000000..bb99e8d
--- /dev/null
+++ b/src/graph/operations/NESimpleOperations.cpp
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/graph/IOperation.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistrar.h"
+#include "arm_compute/graph/Types.h"
+#include "arm_compute/runtime/NEON/NEFunctions.h"
+#include "support/ToolchainSupport.h"
+#include "utils/GraphTypePrinter.h"
+#include "utils/TypePrinter.h"
+
+#include <memory>
+
+using namespace arm_compute::graph;
+
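+// NEON counterparts of the registrations in CLSimpleOperations.cpp: the same NodeContext
+// pattern applies, but the tensors are plain arm_compute::ITensor objects and the
+// configured functions come from the NEON runtime (NEFunctions.h).
+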
+/* Activation Layer */
+REGISTER_SIMPLE_OPERATION(NEActivationLayerOperation, NEON, OperationType::ActivationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in       = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *out      = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
+
+    // Create and configure function
+    auto activation = arm_compute::support::cpp14::make_unique<arm_compute::NEActivationLayer>();
+    activation->configure(in, out, act_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEActivationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Activation function: " << act_info.activation()
+                               << " a: " << act_info.a()
+                               << " b: " << act_info.b()
+                               << std::endl);
+
+    return std::move(activation);
+}
+
+/* Batch Normalization Layer */
+REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationType::BatchNormalizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 5);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(3)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(4)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *mean    = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto      *var     = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
+    auto      *beta    = dynamic_cast<arm_compute::ITensor *>(ctx.input(3));
+    auto      *gamma   = dynamic_cast<arm_compute::ITensor *>(ctx.input(4));
+    auto      *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto epsilon = ctx.parameter<float>("epsilon");
+
+    // Create and configure function
+    auto batch_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEBatchNormalizationLayer>();
+    batch_norm->configure(in, out, mean, var, beta, gamma, epsilon);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEBatchNormalizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Mean shape: " << mean->info()->tensor_shape()
+                               << " Var shape: " << var->info()->tensor_shape()
+                               << " Beta shape: " << beta->info()->tensor_shape()
+                               << " Gamma shape: " << gamma->info()->tensor_shape()
+                               << " Epsilon: " << epsilon
+                               << std::endl);
+
+    return std::move(batch_norm);
+}
+
+/* DepthConvert Layer */
+REGISTER_SIMPLE_OPERATION(NEDepthConvertLayerOperation, NEON, OperationType::DepthConvertLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in          = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *out         = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto conv_policy = ctx.parameter<ConvertPolicy>("ConvertPolicy");
+    const auto shift       = ctx.parameter<uint32_t>("shift");
+
+    // Create and configure function
+    auto depthconvert = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthConvertLayer>();
+    depthconvert->configure(in, out, conv_policy, shift);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthConvertLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " shift: " << shift
+                               << std::endl);
+
+    return std::move(depthconvert);
+}
+
+/* DepthwiseConvolution Layer */
+REGISTER_SIMPLE_OPERATION(NEDepthwiseConvolutionOperation, NEON, OperationType::DepthwiseConvolutionLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2 && ctx.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *weights   = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto      *biases    = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) : nullptr;
+    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
+    const auto opt3x3    = ctx.parameter<bool>("Optimized3x3");
+
+    // Create and configure function
+    std::unique_ptr<arm_compute::IFunction> func;
+    bool                                    run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
+    if(run_3x3_opt)
+    {
+        auto depthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
+        depthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depthwise_conv);
+    }
+    else
+    {
+        auto depthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
+        depthwise_conv->configure(in, weights, biases, out, conv_info);
+        func = std::move(depthwise_conv);
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape());
+    if(biases == nullptr)
+    {
+        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: No biases provided" << std::endl);
+    }
+    else
+    {
+        ARM_COMPUTE_LOG_GRAPH_INFO(" Biases shape: " << biases->info()->tensor_shape() << std::endl);
+    }
+
+    return func;
+}
+
+/* Dequantization Layer */
+REGISTER_SIMPLE_OPERATION(NEDequantizationLayerOperation, NEON, OperationType::DequantizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 2);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(1)) == nullptr);
+
+    // Extract IO and info
+    auto *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    auto *min_max = dynamic_cast<arm_compute::ITensor *>(ctx.output(1));
+
+    // Create and configure function
+    auto dequantization = arm_compute::support::cpp14::make_unique<arm_compute::NEDequantizationLayer>();
+    dequantization->configure(in, out, min_max);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDequantizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Min max shape: " << min_max->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(dequantization);
+}
+
+/* Flatten Layer */
+REGISTER_SIMPLE_OPERATION(NEFlattenLayerOperation, NEON, OperationType::FlattenLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto flatten = arm_compute::support::cpp14::make_unique<arm_compute::NEFlattenLayer>();
+    flatten->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFlattenLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(flatten);
+}
+
+/* Floor Layer */
+REGISTER_SIMPLE_OPERATION(NEFloorLayerOperation, NEON, OperationType::FloorLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto floor = arm_compute::support::cpp14::make_unique<arm_compute::NEFloor>();
+    floor->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFloorLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(floor);
+}
+
+/* Fully Connected Layer */
+REGISTER_SIMPLE_OPERATION(NEFullyConnectedLayer, NEON, OperationType::FullyConnectedLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 3);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *weights = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto *biases  = dynamic_cast<arm_compute::ITensor *>(ctx.input(2));
+    auto *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto fc = arm_compute::support::cpp14::make_unique<arm_compute::NEFullyConnectedLayer>();
+    fc->configure(in, weights, biases, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEFullyConnectedLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Weights shape: " << weights->info()->tensor_shape()
+                               << " Biases Shape: " << biases->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(fc);
+}
+
+/* L2 Normalize Layer */
+REGISTER_SIMPLE_OPERATION(NEL2NormalizeLayerOperation, NEON, OperationType::L2NormalizeLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in      = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *out     = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto axis    = ctx.parameter<unsigned int>("axis");
+    const auto epsilon = ctx.parameter<float>("epsilon");
+
+    // Create and configure function
+    auto l2_norm = arm_compute::support::cpp14::make_unique<arm_compute::NEL2NormalizeLayer>();
+    l2_norm->configure(in, out, axis, epsilon);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEL2NormalizeLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Axis: " << axis
+                               << " Epsilon: " << epsilon
+                               << std::endl);
+
+    return std::move(l2_norm);
+}
+
+/* Normalization Layer */
+REGISTER_SIMPLE_OPERATION(NENormalizationLayerOperation, NEON, OperationType::NormalizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto norm_info = ctx.parameter<NormalizationLayerInfo>("NormalizationLayerInfo");
+
+    // Create and configure function
+    auto norm = arm_compute::support::cpp14::make_unique<arm_compute::NENormalizationLayer>();
+    norm->configure(in, out, norm_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NENormalizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Normalization info: " << norm_info
+                               << std::endl);
+
+    return std::move(norm);
+}
+
+/* Pooling Layer */
+REGISTER_SIMPLE_OPERATION(NEPoolingLayerOperation, NEON, OperationType::PoolingLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto      *in        = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto      *out       = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+    const auto pool_info = ctx.parameter<PoolingLayerInfo>("PoolingLayerInfo");
+
+    // Create and configure function
+    auto pool = arm_compute::support::cpp14::make_unique<arm_compute::NEPoolingLayer>();
+    pool->configure(in, out, pool_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEPoolingLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << " Pooling info: " << pool_info
+                               << std::endl);
+
+    return std::move(pool);
+}
+
+/* Quantization Layer */
+REGISTER_SIMPLE_OPERATION(NEQuantizationLayerOperation, NEON, OperationType::QuantizationLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto quantization = arm_compute::support::cpp14::make_unique<arm_compute::NEQuantizationLayer>();
+    quantization->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEQuantizationLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(quantization);
+}
+
+/* Reshape Layer */
+REGISTER_SIMPLE_OPERATION(NEReshapeLayerOperation, NEON, OperationType::ReshapeLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto reshape = arm_compute::support::cpp14::make_unique<arm_compute::NEReshapeLayer>();
+    reshape->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEReshapeLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(reshape);
+}
+
+/* Softmax Layer */
+REGISTER_SIMPLE_OPERATION(NESoftmaxLayerOperation, NEON, OperationType::SoftmaxLayer)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 1);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract IO and info
+    auto *in  = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    // Create and configure function
+    auto smx = arm_compute::support::cpp14::make_unique<arm_compute::NESoftmaxLayer>();
+    smx->configure(in, out);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NESoftmaxLayer"
+                               << " Data Type: " << in->info()->data_type()
+                               << " Input shape: " << in->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(smx);
+}
\ No newline at end of file
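
The NEON depthwise registration above only takes the 3x3-optimized function when the "Optimized3x3" parameter is set and the kernel width is actually 3; otherwise it falls back to the generic NEDepthwiseConvolutionLayer. A minimal sketch of that selection rule; the helper name below is illustrative and not part of the library:

#include <cstddef>

// Illustrative helper mirroring the dispatch rule in the registration above:
// the 3x3-optimized depthwise convolution is only valid for 3x3 kernels, so it
// is chosen when the graph requests the optimization *and* the kernel width is 3.
inline bool use_3x3_optimized_path(bool opt3x3, std::size_t kernel_width)
{
    return opt3x3 && kernel_width == 3;
}
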
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 69292b9..3ca5071 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -37,77 +37,16 @@
 using namespace arm_compute;
 
 BlobLifetimeManager::BlobLifetimeManager()
-    : _active_group(nullptr), _active_elements(), _finalized_groups(), _blobs()
+    : _blobs()
 {
 }
 
-void BlobLifetimeManager::register_group(IMemoryGroup *group)
-{
-    if(_active_group == nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON(group == nullptr);
-        _active_group = group;
-    }
-}
-
-void BlobLifetimeManager::start_lifetime(void *obj)
-{
-    ARM_COMPUTE_ERROR_ON(obj == nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
-    {
-        return obj == e.id;
-    }) != std::end(_active_elements),
-    "Memory object is already registered!");
-
-    // Insert object in groups and mark its finalized state to false
-    _active_elements.emplace_back(obj);
-}
-
-void BlobLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
-{
-    ARM_COMPUTE_ERROR_ON(obj == nullptr);
-
-    // Find object
-    auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
-    {
-        return obj == e.id;
-    });
-    ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
-
-    // Update object fields and mark object as complete
-    it->handle = handle;
-    it->size   = size;
-    it->status = true;
-
-    // Check if all object are finalized and reset active group
-    if(are_all_finalized())
-    {
-        // Update finalized groups
-        _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
-
-        // Update blobs and group mappings
-        update_blobs_and_mappings();
-
-        // Reset state
-        _active_elements.clear();
-        _active_group = nullptr;
-    }
-}
-
 std::unique_ptr<IMemoryPool> BlobLifetimeManager::create_pool(IAllocator *allocator)
 {
     ARM_COMPUTE_ERROR_ON(allocator == nullptr);
     return support::cpp14::make_unique<BlobMemoryPool>(allocator, _blobs);
 }
 
-bool BlobLifetimeManager::are_all_finalized() const
-{
-    return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
-    {
-        return !e.status;
-    });
-}
-
 MappingType BlobLifetimeManager::mapping_type() const
 {
     return MappingType::BLOBS;
@@ -118,7 +57,7 @@
     ARM_COMPUTE_ERROR_ON(!are_all_finalized());
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
-    // Sort active group requirements in descending order
+    // Sort active group requirements in descending order.
     std::sort(std::begin(_active_elements), std::end(_active_elements), [](const Element & a, const Element & b)
     {
         return a.size > b.size;
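
The remaining hunk keeps the descending-size sort of the active group's requirements before the shared blob sizes are updated. A minimal sketch of why that ordering helps, assuming for illustration that the i-th largest requirement of each group is served by the i-th shared blob; ElementMock and update_blob_sizes are stand-ins, not the library's API:

#include <algorithm>
#include <cstddef>
#include <vector>

struct ElementMock
{
    std::size_t size; // memory requirement of one managed object
};

// Grow the shared blob sizes so that the i-th largest requirement fits in the
// i-th blob. Sorting in descending order means the biggest blob is reused by
// the biggest requirement of every group, so the total pool stays small.
void update_blob_sizes(std::vector<std::size_t> &blobs, std::vector<ElementMock> elements)
{
    std::sort(elements.begin(), elements.end(),
              [](const ElementMock &a, const ElementMock &b) { return a.size > b.size; });

    if(blobs.size() < elements.size())
    {
        blobs.resize(elements.size(), 0);
    }
    for(std::size_t i = 0; i < elements.size(); ++i)
    {
        blobs[i] = std::max(blobs[i], elements[i].size);
    }
}
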
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 865f389..7cd5518 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -61,7 +61,6 @@
     const bool  is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
     TensorShape tensor_shape = _info.tensor_shape();
 
-    // Note: Look-up table used by the OpenVX sample implementation
     const std::array<float, 4> c_orbscale =
     {
         {
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index b64739a..eaf2ca5 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+#include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -34,3 +35,8 @@
     k->configure(input, output, act_info);
     _kernel = std::move(k);
 }
+
+Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return CLActivationLayerKernel::validate(input, output, act_info);
+}
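
CLActivationLayer, and the arithmetic and batch-normalization functions below, gain a static validate() that simply forwards to the corresponding kernel's validate(). A self-contained mock of that pattern, using hypothetical StatusMock/ActivationKernelMock/ActivationFunctionMock types rather than the real arm_compute classes:

#include <string>

// Hypothetical stand-ins; none of these names are the real arm_compute API.
struct StatusMock
{
    bool        ok;
    std::string message;
};

struct ActivationKernelMock
{
    // Kernel-level validation: checks the configuration without touching device memory.
    static StatusMock validate(int input_elements, int output_elements)
    {
        if(input_elements != output_elements)
        {
            return StatusMock{ false, "input and output must have the same number of elements" };
        }
        return StatusMock{ true, "" };
    }
};

struct ActivationFunctionMock
{
    // The function-level validate() just forwards to the kernel, mirroring
    // CLActivationLayer::validate() in the hunk above.
    static StatusMock validate(int input_elements, int output_elements)
    {
        return ActivationKernelMock::validate(input_elements, output_elements);
    }
};

// Typical use: check the configuration up front and only configure/run when it is valid, e.g.
//   if(ActivationFunctionMock::validate(64, 64).ok) { /* configure and run */ }
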
diff --git a/src/runtime/CL/functions/CLArithmeticAddition.cpp b/src/runtime/CL/functions/CLArithmeticAddition.cpp
index 5ca384d..5c2e582 100644
--- a/src/runtime/CL/functions/CLArithmeticAddition.cpp
+++ b/src/runtime/CL/functions/CLArithmeticAddition.cpp
@@ -36,3 +36,8 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLArithmeticAdditionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 651f51a..5fca30c 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -36,3 +36,8 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return CLArithmeticSubtractionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 68cdaac..58215c3 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -42,6 +42,14 @@
     _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
 }
 
+Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ITensorInfo *mean, const ITensorInfo *var,
+                                           const ITensorInfo *beta, const ITensorInfo *gamma,
+                                           float epsilon)
+{
+    return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+}
+
 void CLBatchNormalizationLayer::run()
 {
     CLScheduler::get().enqueue(_norm_kernel, true);
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 4b1bfd8..0ed3351 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include <cmath>
@@ -42,19 +43,22 @@
 
 void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
 
     if(biases != nullptr)
     {
+        ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
-    const bool _has_bias = (biases != nullptr);
+    const bool       append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
+    const unsigned   bias_element  = (append_biases) ? 1 : 0;
+    const ICLTensor *biases_to_use = (append_biases) ? biases : nullptr;
 
     _transpose1xW = transpose1xW;
 
@@ -62,7 +66,7 @@
     {
         // Create tensor to store the reshaped weights
         const unsigned int mat_weights_cols = weights->info()->dimension(3);
-        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+        const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
         TensorShape        shape_wr(mat_weights_cols, mat_weights_rows);
         const DataType     dt                   = weights->info()->data_type();
         const int          fixed_point_position = weights->info()->fixed_point_position();
@@ -70,13 +74,13 @@
 
         _weights_reshaped.allocator()->init(info_wr);
         _memory_group.manage(&_weights_reshaped);
-        _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
+        _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
         _weights_transposed_kernel.configure(&_weights_reshaped, output);
         _weights_reshaped.allocator()->allocate();
     }
     else
     {
-        _weights_reshape_kernel.configure(weights, biases, output);
+        _weights_reshape_kernel.configure(weights, biases_to_use, output);
     }
 }
 
@@ -95,43 +99,77 @@
 }
 
 CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(),
-      _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+    : _memory_group(memory_manager), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _output_col2im_kernel(),
+      _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false),
+      _are_weights_reshaped(false), _is_quantized(false)
 {
 }
 
+void CLConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+{
+    if(_is_quantized)
+    {
+        // The quantized convolution is computed with negated offsets, so temporarily change the QuantizationInfo()
+        // Extract and negate the input and weights offsets
+        const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+
+        // Restore the original QuantizationInfo, as input and weights could be used in other convolution layers
+        input->info()->set_quantization_info(input_quantization_info);
+        weights->info()->set_quantization_info(weights_quantization_info);
+    }
+    else
+    {
+        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+    }
+}
+
 void CLConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
     ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+    ARM_COMPUTE_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->info()->data_type()));
+
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     if(biases != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        if(_is_quantized)
+        {
+            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        }
         ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
         ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
     }
 
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
+    const DataType dt = input->info()->data_type();
 
     // Set the GPU target for matrix multiply
     _mm_kernel.set_target(CLScheduler::get().target());
 
-    _has_bias             = (biases != nullptr);
+    _append_bias          = (biases != nullptr) && (!_is_quantized);
     _are_weights_reshaped = weights_info.are_reshaped();
 
+    const unsigned   bias_element  = (_append_bias) ? 1 : 0;
+    const ICLTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -144,36 +182,44 @@
 
     // Check if its a "fully connected" convolution
     _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+    const bool run_interleaved      = (!_is_fully_connected_convolution && !_is_quantized);
 
     unsigned int mat_weights_cols = weights->info()->dimension(3);
-    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
+    unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
 
     // Reshape weights if needed
     if(_are_weights_reshaped)
     {
-        mat_weights_cols                         = weights_info.num_kernels();
-        const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
-        mat_weights_rows                         = (_has_bias ? 1 + quarter_reshaped_cols : quarter_reshaped_cols);
+        if(_is_fully_connected_convolution || _is_quantized)
+        {
+            mat_weights_cols = weights->info()->dimension(0);
+            mat_weights_rows = weights->info()->dimension(1);
+        }
+        else
+        {
+            mat_weights_cols                         = weights_info.num_kernels();
+            const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+            mat_weights_rows                         = quarter_reshaped_cols + bias_element;
+        }
     }
     else
     {
-        if(_is_fully_connected_convolution)
+        if(_is_fully_connected_convolution || _is_quantized)
         {
             // Create tensor to store the reshaped weights
             TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
-            TensorInfo  info_wr(shape_wr, 1, dt, fixed_point_position);
-            _weights_reshaped.allocator()->init(info_wr);
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, false /* 1xW transpose */);
+            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
+            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
         }
         else
         {
             // Create tensor to store transposed weights
             const float transpose_width = 16.0f / input->info()->element_size();
             TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
-            TensorInfo  info_wt(shape_wt, 1, dt, fixed_point_position);
-            _weights_reshaped.allocator()->init(info_wt);
-            _reshape_weights.configure(weights, biases, &_weights_reshaped, true /* 1xW transpose */);
+            _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
+            _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
         }
+        _weights_reshaped.info()->set_quantization_info(weights->info()->quantization_info());
         weights = &_weights_reshaped;
     }
 
@@ -184,16 +230,20 @@
     shape_im2col.set(0, mat_input_cols);
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
-    _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+    TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
+    im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
+    _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
     _memory_group.manage(&_input_im2col_reshaped);
 
     // Create tensor (interleave) to prepare input tensor for GEMM
-    if(!_is_fully_connected_convolution)
+    if(run_interleaved)
     {
         TensorShape shape_interleaved = shape_im2col;
         shape_interleaved.set(0, shape_interleaved.x() * 4);
         shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
-        _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+        TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
+        interleaved_info.set_quantization_info(input->info()->quantization_info());
+        _input_interleaved_reshaped.allocator()->init(interleaved_info);
         _memory_group.manage(&_input_interleaved_reshaped);
     }
 
@@ -201,27 +251,51 @@
     TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
     shape_gemm.set(0, mat_weights_cols);
     shape_gemm.set(1, mat_input_rows);
-    _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+    const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
+    // For a quantized asymmetric input the GEMM output is S32, keeping the raw integer accumulators until the quantized output stage
+    TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+    info_gemm.set_quantization_info(output->info()->quantization_info());
+    _gemm_output.allocator()->init(info_gemm);
     _memory_group.manage(&_gemm_output);
 
     // Configure kernels
-    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
+    _input_im2col_kernel.set_target(CLScheduler::get().target());
+    _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
 
     // Configure matrix multiply
-    if(_is_fully_connected_convolution)
+    if(run_interleaved)
     {
-        // The matrix A and Matrix B have not been reshaped
-        _mm_kernel.configure(&_input_im2col_reshaped, weights, &_gemm_output, 1.0f, false);
+        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+        configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
+        _input_interleaved_reshaped.allocator()->allocate();
     }
     else
     {
-        _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
-        _mm_kernel.configure(&_input_interleaved_reshaped, weights, &_gemm_output, 1.0f);
-        _input_interleaved_reshaped.allocator()->allocate();
+        configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
     }
     _input_im2col_reshaped.allocator()->allocate();
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
-    _gemm_output.allocator()->allocate();
+
+    // Configure output stage for quantized case
+    if(_is_quantized)
+    {
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _gemmlowp_output_stage.configure(&_gemm_output, biases, &_tmp_output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+        _gemm_output.allocator()->allocate();
+    }
+
+    // Configure Col2Im
+    _output_col2im_kernel.set_target(CLScheduler::get().target());
+    _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
+    if(_is_quantized)
+    {
+        _tmp_output.allocator()->allocate();
+    }
+    else
+    {
+        _gemm_output.allocator()->allocate();
+    }
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
 
@@ -243,15 +317,30 @@
 
     _memory_group.acquire();
 
-    // Run input reshaping
+    // Run im2col
     CLScheduler::get().enqueue(_input_im2col_kernel);
-    if(!_is_fully_connected_convolution)
+
+    if(!_is_fully_connected_convolution && !_is_quantized)
     {
+        // Run interleave4x4
         CLScheduler::get().enqueue(_input_interleave_kernel);
     }
 
     // Runs matrix multiply on reshaped matrices
-    CLScheduler::get().enqueue(_mm_kernel);
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.run();
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel);
+    }
+
+    // Run output stage for quantized case
+    if(_is_quantized)
+    {
+        _gemmlowp_output_stage.run();
+    }
 
     // Reshape output matrix
     CLScheduler::get().enqueue(_output_col2im_kernel, false);
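
The quantized output stage above converts the real multiplier input_scale * weights_scale / output_scale into an integer multiplier plus a right shift via quantization::calculate_quantized_multiplier_less_than_one(). A self-contained sketch of the usual gemmlowp-style decomposition; the library's exact implementation may differ in rounding details:

#include <cassert>
#include <cmath>
#include <cstdint>

// Decompose a real multiplier in (0, 1) into a Q0.31 fixed-point multiplier and a
// right shift, so that multiplier ~= quantized_multiplier * 2^-31 * 2^-right_shift.
void decompose_multiplier(double multiplier, std::int32_t *quantized_multiplier, int *right_shift)
{
    assert(multiplier > 0.0 && multiplier < 1.0);

    int          exponent = 0;
    const double q        = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    std::int64_t q_fixed  = static_cast<std::int64_t>(std::llround(q * (1LL << 31)));

    if(q_fixed == (1LL << 31)) // rounding pushed q up to 1.0: halve it and bump the exponent
    {
        q_fixed /= 2;
        ++exponent;
    }

    *right_shift          = -exponent; // exponent <= 0 because multiplier < 1
    *quantized_multiplier = static_cast<std::int32_t>(q_fixed);
    assert(*right_shift >= 0);
}
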
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
similarity index 89%
rename from src/runtime/CL/functions/CLDepthConcatenate.cpp
rename to src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
index 89e44ca..05b5d54 100644
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/CL/functions/CLDepthConcatenateLayer.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
@@ -33,7 +33,7 @@
 
 using namespace arm_compute;
 
-CLDepthConcatenate::CLDepthConcatenate() // NOLINT
+CLDepthConcatenateLayer::CLDepthConcatenateLayer() // NOLINT
     : _inputs_vector(),
       _concat_kernels_vector(),
       _border_handlers_vector(),
@@ -41,7 +41,7 @@
 {
 }
 
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+void CLDepthConcatenateLayer::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
@@ -49,7 +49,7 @@
 
     unsigned int depth_offset = 0;
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateLayerKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
 
     TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
@@ -66,7 +66,7 @@
     }
 }
 
-void CLDepthConcatenate::run()
+void CLDepthConcatenateLayer::run()
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
diff --git a/src/runtime/CL/functions/CLDepthConvert.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
similarity index 83%
rename from src/runtime/CL/functions/CLDepthConvert.cpp
rename to src/runtime/CL/functions/CLDepthConvertLayer.cpp
index b64d05b..b448465 100644
--- a/src/runtime/CL/functions/CLDepthConvert.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
 
-#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void CLDepthConvert::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<CLDepthConvertLayerKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
similarity index 73%
rename from src/runtime/CL/functions/CLDepthwiseConvolution.cpp
rename to src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 22c037f..02273fe 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/PixelValue.h"
@@ -30,37 +30,43 @@
 
 using namespace arm_compute;
 
-CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
+CLDepthwiseConvolutionLayer3x3::CLDepthwiseConvolutionLayer3x3()
     : _kernel(), _border_handler()
 {
 }
 
-void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer3x3::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
-    _kernel.configure(input, output, weights, conv_info);
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    _kernel.set_target(CLScheduler::get().target());
+    _kernel.configure(input, weights, biases, output, conv_info);
+
+    // Configure border handler
+    PixelValue &&zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+    }
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
 }
 
-void CLDepthwiseConvolution3x3::run()
+void CLDepthwiseConvolutionLayer3x3::run()
 {
     CLScheduler::get().enqueue(_border_handler);
     CLScheduler::get().enqueue(_kernel);
 }
 
-CLDepthwiseConvolution::CLDepthwiseConvolution()
+CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
       _v2mm_output()
 {
 }
 
-void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
@@ -68,36 +74,41 @@
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
+    const bool      has_bias   = (biases != nullptr);
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
 
     // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h;
+    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
     const size_t conv_size  = conv_w * conv_h;
 
+    // Im2Col configuration
     TensorShape shape_im2col = input->info()->tensor_shape();
     shape_im2col.set(0, patch_size);
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input_reshaped.allocator()->init(info_im2col);
+    _im2col_kernel.set_target(gpu_target);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
 
+    // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
-    TensorShape       shape_v2mm_out = output->info()->tensor_shape();
+    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+    // GEMV configuration
+    TensorShape shape_v2mm_out = input->info()->tensor_shape();
     shape_v2mm_out.set(0, conv_size * weights_z);
     shape_v2mm_out.set(1, 1);
     shape_v2mm_out.set(2, 1);
-
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
     const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    _input_reshaped.allocator()->init(info_im2col);
-    _weights_reshaped.allocator()->init(info_weights_reshape);
     _v2mm_output.allocator()->init(info_v2mm_out);
-
-    // Configure kernels
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped);
+    _v2mm_kernel.set_target(gpu_target);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
     _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
 
@@ -113,7 +124,7 @@
     _v2mm_output.allocator()->allocate();
 }
 
-void CLDepthwiseConvolution::run()
+void CLDepthwiseConvolutionLayer::run()
 {
     CLScheduler::get().enqueue(_im2col_kernel);
 
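
The depthwise im2col/GEMV path above derives its intermediate shapes from the kernel size, the bias flag and the convolved output size. A small worked example of that bookkeeping, with made-up sizes:

#include <cstddef>
#include <iostream>

int main()
{
    // Made-up example: 3x3 depthwise kernel, bias present, 16x16 convolved output, 32 channels.
    const std::size_t weights_w = 3, weights_h = 3, weights_z = 32;
    const bool        has_bias  = true;
    const std::size_t conv_w = 16, conv_h = 16;

    // One im2col "patch" per output position: kernel elements plus one slot for the bias.
    const std::size_t patch_size = weights_w * weights_h + (has_bias ? 1 : 0); // 10
    const std::size_t conv_size  = conv_w * conv_h;                            // 256

    // Shapes configured by the kernels above:
    //   im2col output:      (patch_size, conv_size, weights_z)
    //   reshaped weights:   (patch_size, weights_z)
    //   GEMV (v2mm) output: (conv_size * weights_z, 1, 1)
    std::cout << "im2col:  " << patch_size << " x " << conv_size << " x " << weights_z << "\n";
    std::cout << "weights: " << patch_size << " x " << weights_z << "\n";
    std::cout << "v2mm:    " << conv_size * weights_z << " x 1 x 1\n";
    return 0;
}
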
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
index c325b3e..af2c6f0 100644
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -35,12 +35,12 @@
 {
 }
 
-void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases,
-                                                     ICLTensor           *output,
+void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out,
+                                                     const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output,
                                                      const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
 {
-    _depthwise_conv.configure(input, depthwise_out, depthwise_weights, depthwise_conv_info);
-    _pointwise_conv.configure(depthwise_out, pointwise_weights, biases, output, pointwise_conv_info);
+    _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
+    _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
 }
 
 void CLDepthwiseSeparableConvolutionLayer::run()
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 6fafd9c..d6a335c 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
@@ -45,7 +46,17 @@
     _direct_conv_kernel.configure(input, weights, biases, output, conv_info);
 
     // Configure border handler
-    _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    PixelValue &&zero_value(0.f);
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    {
+        zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().offset));
+    }
+    _input_border_handler.configure(input, _direct_conv_kernel.border_size(), BorderMode::CONSTANT, zero_value);
+}
+
+Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    return CLDirectConvolutionLayerKernel::validate(input, weights, biases, output, conv_info, CLScheduler::get().target());
 }
 
 void CLDirectConvolutionLayer::run()
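
The border handler above fills CONSTANT padding with the quantization offset instead of a literal zero because, in the asymmetric scheme real = scale * (q - offset), the real value 0.0 quantizes exactly to the offset. A tiny worked check with made-up parameters:

#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
    // Made-up QASYMM8 parameters: real = scale * (quantized - offset).
    const float        scale  = 0.05f;
    const std::int32_t offset = 128;

    // Quantizing the real value 0.0 gives exactly the offset, which is why the
    // CONSTANT border is filled with the offset for quantized inputs.
    const std::int32_t q_zero = static_cast<std::int32_t>(std::lround(0.0f / scale)) + offset;
    std::cout << "quantized zero = " << q_zero << " (offset = " << offset << ")\n";
    return 0;
}
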
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index ee1558f..7fd81cd 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
@@ -40,34 +41,55 @@
 }
 
 CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
-      _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
+    : _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
+      _gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
 {
 }
 
+void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+{
+    if(_is_quantized)
+    {
+        // The quantized matrix multiplication is computed with negated offsets, so temporarily change the QuantizationInfo()
+        // Extract and negate the input and weights offsets
+        const QuantizationInfo input_quantization_info   = input->info()->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+
+        input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+        weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+
+        // Configure gemmlowp function
+        _mm_gemmlowp.configure(input, weights, output);
+
+        // Restore the original QuantizationInfo, as input and weights could be used in other fully connected layers
+        input->info()->set_quantization_info(input_quantization_info);
+        weights->info()->set_quantization_info(weights_quantization_info);
+    }
+    else
+    {
+        // Configure matrix multiply kernel
+        _mm_kernel.set_target(CLScheduler::get().target());
+        _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+    }
+}
+
 void CLFullyConnectedLayer::configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output)
 {
     ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
 
-    const DataType dt                   = input->info()->data_type();
-    const int      fixed_point_position = input->info()->fixed_point_position();
-
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col;
-    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
-    shape_im2col.set(1, input->info()->dimension(3));
-    shape_im2col.set(2, input->info()->dimension(4));
-    shape_im2col.set(3, input->info()->dimension(5));
-    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+    TensorShape shape_im2col = input->info()->tensor_shape();
+    shape_im2col.collapse(3);
+    _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
     _memory_group.manage(&_im2col_output);
     _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
+    configure_mm(&_im2col_output, weights, output, false);
 
     // Allocate the output tensor for im2col once all the configure methods have been called
     _im2col_output.allocator()->allocate();
@@ -78,26 +100,35 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
 
     // Configure matrix multiply kernel
-    _mm_kernel.configure(input, weights, output, 1.0f, false);
+    configure_mm(input, weights, output, false);
 }
 
 void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
-    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() != 2);
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
 
     _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
     _is_fc_after_conv     = true;
     _accumulate_biases    = false;
+    _is_quantized         = is_data_type_quantized_asymmetric(input->info()->data_type());
 
-    if(biases != nullptr)
+    // Configure gemmlowp output
+    if(_is_quantized)
+    {
+        _gemmlowp_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+    }
+
+    // Configure accumulate biases kernel for non quantized asymmetric types
+    if(biases != nullptr && !_is_quantized)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
 
         _accumulate_biases = true;
 
         // Configure accumulate biases kernel
+        _accumulate_biases_kernel.set_target(CLScheduler::get().target());
         _accumulate_biases_kernel.configure(output, biases);
     }
 
@@ -131,15 +162,26 @@
         _is_fc_after_conv = input->info()->num_dimensions() > 1;
     }
 
+    ICLTensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output;
     if(_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        configure_conv_fc(input, weights_to_use, output);
+        configure_conv_fc(input, weights_to_use, tmp_output);
     }
     else
     {
         // Fully Connected layer after a Fully Connected Layer without batches
-        configure_fc_fc(input, weights_to_use, output);
+        configure_fc_fc(input, weights_to_use, tmp_output);
+    }
+
+    // Configure output stage for asymmetric quantized types
+    if(_is_quantized)
+    {
+        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+        int   output_multiplier, output_shift;
+        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+        _gemmlowp_output.allocator()->allocate();
     }
 
     // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
@@ -168,12 +210,26 @@
     }
 
     // Run matrix multiply
-    CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+    if(_is_quantized)
+    {
+        _mm_gemmlowp.run();
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+    }
 
     // Accumulate biases if provided
-    if(_accumulate_biases)
+    if(_is_quantized)
     {
-        CLScheduler::get().enqueue(_accumulate_biases_kernel);
+        _gemmlowp_output_stage.run();
+    }
+    else
+    {
+        if(_accumulate_biases)
+        {
+            CLScheduler::get().enqueue(_accumulate_biases_kernel);
+        }
     }
 
     _memory_group.release();
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index a81d113..ca0228f 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -65,6 +65,9 @@
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
 
+    // Set the target for the matrix multiply kernel
+    _mm_kernel.set_target(CLScheduler::get().target());
+
     if(_is_interleaved_transposed)
     {
         matrix_a = &_tmp_a;
@@ -95,9 +98,6 @@
 
         // Configure transpose kernel
         _transpose_kernel.configure(b, &_tmp_b);
-
-        // Configure matrix multiply kernel
-        _mm_kernel.set_target(CLScheduler::get().target());
     }
 
     _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
deleted file mode 100644
index db6d11c..0000000
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-
-    // Create shape for interleaved temporary tensor
-    TensorShape shape_tmp_a = a->info()->tensor_shape();
-    shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-    shape_tmp_a.set(1, ceil(a->info()->dimension(1) / 4));
-    TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-    _tmp_a.allocator()->init(info_a);
-
-    // Create shape for tranposed temporary tensor
-    TensorShape shape_tmp_b = b->info()->tensor_shape();
-    shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-    shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16));
-    TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
-    _tmp_b.allocator()->init(info_b);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp_a);
-    _memory_group.manage(&_tmp_b);
-
-    // Configure kernels
-    _interleave_kernel.configure(a, &_tmp_a);
-    _transpose_kernel.configure(b, &_tmp_b);
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
-    // Allocate intermediate buffers
-    _tmp_a.allocator()->allocate();
-    _tmp_b.allocator()->allocate();
-}
-
-void CLGEMMLowp::run()
-{
-    _memory_group.acquire();
-
-    /* Run interleave kernel */
-    CLScheduler::get().enqueue(_interleave_kernel, false);
-
-    /* Run transpose kernel */
-    CLScheduler::get().enqueue(_transpose_kernel, false);
-
-    /* Run matrix multiply kernel */
-    CLScheduler::get().enqueue(_mm_kernel, false);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..5c6f5b4
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+      _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _a_offset                    = a->info()->quantization_info().offset;
+    _b_offset                    = b->info()->quantization_info().offset;
+
+    // If the input tensor has 16 rows or fewer, run a special version of GEMMLowp without reshaping the input tensors
+    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+    const ICLTensor *matrix_a = a;
+    const ICLTensor *matrix_b = b;
+
+    if(_is_interleaved_transposed)
+    {
+        matrix_a = &_tmp_a;
+        matrix_b = &_tmp_b;
+
+        // The interleaved output matrix will have the following shape: [ a_width * 4, ceil(a_height / 4.0f) ]
+        TensorShape shape_tmp_a = a->info()->tensor_shape();
+        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->info()->tensor_shape();
+        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        _tmp_a.allocator()->init(info_a);
+        _tmp_b.allocator()->init(info_b);
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
+        // Configure interleave kernel
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a);
+
+        // Configure transpose kernel
+        _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+    }
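For a concrete picture of the two temporary shapes computed above, the short standalone sketch below evaluates them for a hypothetical matrix A of width 20 and height 37 and a matrix B of width 64 and height 20 (dimension 0 = width, dimension 1 = height, as in the library).

    // Standalone sketch: temporary shapes produced by the interleave-4x4 and
    // transpose-1x16 reshapes for hypothetical A and B matrices.
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int a_width = 20, a_height = 37; // hypothetical A: 37 rows x 20 columns
        const int b_width = 64, b_height = 20; // hypothetical B: 20 rows x 64 columns

        // Interleave 4x4: width grows by 4, height shrinks by 4 (rounded up)
        const int tmp_a_w = a_width * 4;
        const int tmp_a_h = static_cast<int>(std::ceil(a_height / 4.f));

        // Transpose 1x16: width becomes height * 16, height becomes ceil(width / 16)
        const int tmp_b_w = b_height * 16;
        const int tmp_b_h = static_cast<int>(std::ceil(b_width / 16.f));

        std::printf("tmp_a: [%d, %d], tmp_b: [%d, %d]\n", tmp_a_w, tmp_a_h, tmp_b_w, tmp_b_h);
        // Expected: tmp_a: [80, 10], tmp_b: [320, 4]
        return 0;
    }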
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed);
+
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+
+        if(shape_vector_sum_col.num_dimensions() > 1)
+        {
+            shape_vector_sum_col.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+        _vector_sum_col.allocator()->init(info_vector_sum_col);
+        _memory_group.manage(&_vector_sum_col);
+
+        // Configure Matrix B reduction kernel
+        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+        if(a->info()->num_dimensions() > 1)
+        {
+            shape_vector_sum_row.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+        _vector_sum_row.allocator()->init(info_vector_sum_row);
+        _memory_group.manage(&_vector_sum_row);
+
+        // Configure matrix A reduction kernel
+        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
+    }
+
+    // Configure offset contribution kernel
+    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+    // Allocate tensors
+    if(_is_interleaved_transposed)
+    {
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+
+    if(_a_offset != 0)
+    {
+        _vector_sum_col.allocator()->allocate();
+    }
+
+    if(_b_offset != 0)
+    {
+        _vector_sum_row.allocator()->allocate();
+    }
+}
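The reduction and offset-contribution kernels configured above rely on the standard GEMMLowp identity: expanding sum_k (A[m][k] - a_offset) * (B[k][n] - b_offset) shows that the raw uint8 product only needs the per-row sums of A, the per-column sums of B and one constant term to account for the offsets. The standalone sketch below verifies that identity on hypothetical values.

    // Standalone sketch of the GEMMLowp offset contribution: the S32 result of
    // sum_k (A[m][k] - a_offset) * (B[k][n] - b_offset) is obtained from the raw
    // uint8 matrix product by adding -b_offset * row_sum(A), -a_offset * col_sum(B)
    // and the constant K * a_offset * b_offset.
    #include <array>
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        constexpr int M = 2, N = 2, K = 3;
        const int32_t a_offset = 3, b_offset = 5; // hypothetical zero points

        const std::array<std::array<uint8_t, K>, M> A = {{{10, 20, 30}, {40, 50, 60}}};
        const std::array<std::array<uint8_t, N>, K> B = {{{1, 2}, {3, 4}, {5, 6}}};

        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                int32_t reference = 0, raw = 0, row_sum_a = 0, col_sum_b = 0;
                for(int k = 0; k < K; ++k)
                {
                    reference += (A[m][k] - a_offset) * (B[k][n] - b_offset);
                    raw       += A[m][k] * B[k][n];
                    row_sum_a += A[m][k];
                    col_sum_b += B[k][n];
                }
                const int32_t corrected = raw - b_offset * row_sum_a - a_offset * col_sum_b + K * a_offset * b_offset;
                assert(corrected == reference); // the contribution reproduces the offset-aware product
                std::printf("out[%d][%d] = %d\n", m, n, corrected);
            }
        }
        return 0;
    }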
+
+void CLGEMMLowpMatrixMultiplyCore::run()
+{
+    _memory_group.acquire();
+
+    if(_is_interleaved_transposed)
+    {
+        // Run reshape matrix A
+        CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
+
+        if(_is_first_run || !_reshape_b_only_on_first_run)
+        {
+            // Run reshape matrix B
+            CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+        }
+    }
+
+    // Note: if _reshape_b_only_on_first_run = true, the matrix B reduction only needs to be executed on the first run
+    if(_is_first_run || !_reshape_b_only_on_first_run)
+    {
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0
+        if(_a_offset != 0)
+        {
+            CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+        }
+    }
+
+    // Run matrix multiply
+    CLScheduler::get().enqueue(_mm_kernel, false);
+
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
+        CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
+    }
+
+    // Run offset contribution kernel
+    CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+
+    _memory_group.release();
+
+    _is_first_run = false;
+}
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000..16d8678
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+    k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
+                                                                    int result_offset_after_shift, int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
\ No newline at end of file
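The Scale variant wrapped above quantizes the S32 GEMMLowp result down to uint8 using an integer offset, multiplier and shift. The standalone helper below sketches that arithmetic with hypothetical parameters; the exact order in which the kernel folds in the optional bias and applies the [min, max] clamp is simplified here.

    // Sketch of a "quantize down S32 -> uint8 by scale" stage: add bias and offset,
    // multiply by an integer multiplier, shift right, then clamp to the uint8 range.
    // All parameter values are hypothetical and the bias/clamping order is simplified.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static uint8_t quantize_down(int32_t acc, int32_t bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
    {
        int32_t value = ((acc + bias + result_offset) * result_mult_int) >> result_shift;
        value         = std::min(255, std::max(0, value));
        return static_cast<uint8_t>(value);
    }

    int main()
    {
        const uint8_t q = quantize_down(/*acc=*/840, /*bias=*/16, /*result_offset=*/32, /*result_mult_int=*/3, /*result_shift=*/5);
        std::printf("%d\n", static_cast<int>(q)); // (840 + 16 + 32) * 3 = 2664, 2664 >> 5 = 83
        return 0;
    }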
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 8436dce..4b32954 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -157,7 +157,7 @@
             _gauss5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale image kernel */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED);
+            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode == BorderMode::UNDEFINED, SamplingPolicy::CENTER);
         }
 
         _tmp.allocate();
diff --git a/src/runtime/CL/functions/CLL2Normalize.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
similarity index 84%
rename from src/runtime/CL/functions/CLL2Normalize.cpp
rename to src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 99be8ca..d1bb65f 100644
--- a/src/runtime/CL/functions/CLL2Normalize.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -21,10 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLL2Normalize.h"
+#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h"
+#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -34,12 +34,12 @@
 
 using namespace arm_compute;
 
-CLL2Normalize::CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+CLL2NormalizeLayer::CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
-void CLL2Normalize::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
+void CLL2NormalizeLayer::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon)
 {
     // Manage intermediate buffers
     _memory_group.manage(&_sumsq);
@@ -52,7 +52,7 @@
     _sumsq.allocator()->allocate();
 }
 
-void CLL2Normalize::run()
+void CLL2NormalizeLayer::run()
 {
     _memory_group.acquire();
 
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index a395487..7e5278f 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -29,7 +29,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvert.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
 #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
 #include "support/ToolchainSupport.h"
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 68b8c35..9d6ac7a 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -30,8 +30,10 @@
 
 using namespace arm_compute;
 
-void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type)
+void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type, bool use_fp16)
 {
+    ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-644): Add half float support
+
     auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
     k->configure(input1, input2, output, nullptr, mag_type);
     _kernel = std::move(k);
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index f4bd494..32d8f15 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -37,7 +37,7 @@
 {
 }
 
-void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
 
@@ -48,6 +48,11 @@
     _border_handler.configure(input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
 }
 
+Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+    return CLNormalizationLayerKernel::validate(input, output, norm_info);
+}
+
 void CLNormalizationLayer::run()
 {
     // Run border handler
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 139d466..c78f944 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -37,3 +37,9 @@
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
 }
+
+Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 2cb7d63..2341633 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -23,19 +23,36 @@
  */
 #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
     // Configure pooling kernel
     auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
+    k->set_target(CLScheduler::get().target());
     k->configure(input, output, pool_info);
     _kernel = std::move(k);
 
     // Configure border depending on operation required
     BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+    // Quantize the border value if the data type is quantized asymmetric
+    uint32_t border_value = 0;
+    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+    {
+        border_value = static_cast<uint32_t>(input->info()->quantization_info().quantize(0.f, RoundingPolicy::TO_NEAREST_UP));
+    }
+
+    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(border_value));
 }
+
+Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    return CLPoolingLayerKernel::validate(input, output, pool_info);
+}
\ No newline at end of file
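The border handling above uses the quantized representation of 0 rather than a literal 0 when the input is QASYMM8, because with an asymmetric scheme the real value 0.0f maps to the zero-point offset. A minimal standalone illustration, with a hypothetical scale and offset:

    // For an asymmetric quantized tensor, the real value 0.0f maps to the zero point
    // (offset), so a CONSTANT border of "zero" must use the quantized value.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float scale  = 0.05f; // hypothetical quantization scale
        const int   offset = 128;   // hypothetical zero point

        // quantize(0.0f) = round(0.0f / scale) + offset, clamped to [0, 255]
        const int q = std::min(255, std::max(0, static_cast<int>(std::lround(0.f / scale)) + offset));
        std::printf("border value: %d\n", q); // 128, not 0
        return 0;
    }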
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 49b0275..cb68481 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -31,10 +31,10 @@
 
 using namespace arm_compute;
 
-void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
+void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
-    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED);
+    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
     _kernel = std::move(k);
     _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
 }
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7505a2c..7c96111 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -23,40 +23,59 @@
  */
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 using namespace arm_compute;
 
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp(), _run_legacy_path(false)
 {
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output)
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    // Perform validation step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
 
     // Create intermediate tensors shapes
-    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+    const TensorInfo input_info    = input->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
+    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+    _tmp.allocator()->init(tensor_info_tmp);
 
-    TensorShape shape = input->info()->tensor_shape();
-    shape.set(0, 1);
-    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
-    _max.allocator()->init(tensor_info_max_sum);
-    _sum.allocator()->init(tensor_info_max_sum);
+    TensorShape max_sum_shape = input->info()->tensor_shape();
+    max_sum_shape.set(0, 1);
+    _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
+    _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
+
+    // Set the GPU target for the kernels
+    _max_shift_exp_sum_kernel.set_target(CLScheduler::get().target());
 
     // Manage intermediate buffers
     _memory_group.manage(&_tmp);
     _memory_group.manage(&_max);
     _memory_group.manage(&_sum);
 
-    // Configure Kernels
-    _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
-    _norm_kernel.configure(&_tmp, &_sum, output);
+    // Configure kernels
+    _run_legacy_path = is_data_type_quantized_asymmetric(input->info()->data_type());
+    if(_run_legacy_path)
+    {
+        _max_kernel.configure(input, &_max);
+        _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
+    }
+    else
+    {
+        _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
+    }
+    _norm_kernel.configure(&_tmp, &_sum, output, beta);
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
@@ -64,12 +83,48 @@
     _sum.allocator()->allocate();
 }
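The kernels configured above compute a softmax scaled by beta, using the usual max-subtraction for numerical stability; the legacy path splits the max and shift/exp/sum steps while the fused kernel combines them. Below is a minimal float-only reference of the per-row computation (illustrative only; the library kernels additionally handle F16, fixed-point and quantized types).

    // Reference softmax with a beta scaling factor and max-subtraction for stability.
    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    static std::vector<float> softmax(const std::vector<float> &x, float beta)
    {
        const float        max_val = *std::max_element(x.begin(), x.end());
        std::vector<float> out(x.size());
        float              sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = std::exp(beta * (x[i] - max_val)); // shift + exp
            sum += out[i];                              // running sum
        }
        for(float &v : out)
        {
            v /= sum; // normalization step
        }
        return out;
    }

    int main()
    {
        const std::vector<float> logits = {1.f, 2.f, 3.f}; // hypothetical input row
        for(float v : softmax(logits, /*beta=*/1.f))
        {
            std::printf("%f ", v);
        }
        std::printf("\n");
        return 0;
    }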
 
+Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    // Create intermediate tensor info
+    DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
+    TensorInfo tensor_info_tmp(input->clone()->set_data_type(tmp_data_type));
+
+    TensorShape max_sum_shape = input->tensor_shape();
+    max_sum_shape.set(0, 1);
+    TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape));
+    TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()));
+
+    bool run_legacy_path = is_data_type_quantized_asymmetric(input->data_type());
+    if(run_legacy_path)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxKernel::validate(input, &tensor_info_max));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
+
+    return Status{};
+}
+
 void CLSoftmaxLayer::run()
 {
     _memory_group.acquire();
 
-    CLScheduler::get().enqueue(_max_kernel, false);
-    CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+    // Use the legacy (max + shift/exp/sum) kernels for quantized asymmetric types, otherwise run the fused kernel
+    if(_run_legacy_path)
+    {
+        CLScheduler::get().enqueue(_max_kernel, false);
+        CLScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
+    }
     CLScheduler::get().enqueue(_norm_kernel);
 
     _memory_group.release();
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index cd19e25..ecb59f7 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -35,4 +35,9 @@
     auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
+}
+
+Status CLTranspose::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLTransposeKernel::validate(input, output);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/CPP/ICPPSimpleFunction.cpp
similarity index 70%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/CPP/ICPPSimpleFunction.cpp
index 37857b6..42a2d22 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/CPP/ICPPSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
+#include "arm_compute/runtime/Scheduler.h"
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+ICPPSimpleFunction::ICPPSimpleFunction() // NOLINT
+    : _kernel()
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
-    _kernel = std::move(k);
+}
+
+void ICPPSimpleFunction::run()
+{
+    Scheduler::get().schedule(_kernel.get(), Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/CPP/functions/CPPPermute.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/CPP/functions/CPPPermute.cpp
index 37857b6..bafcd2f 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/CPP/functions/CPPPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void CPPPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<CPPPermuteKernel>();
+    k->configure(input, output, perm);
     _kernel = std::move(k);
 }
+
+Status CPPPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
+{
+    return CPPPermuteKernel::validate(input, output, perm);
+}
diff --git a/src/core/Logger.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
similarity index 63%
copy from src/core/Logger.cpp
copy to src/runtime/GLES_COMPUTE/GCScheduler.cpp
index 9c3bf26..b2235ea 100644
--- a/src/core/Logger.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -22,35 +22,40 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+GCScheduler::GCScheduler() = default;
+
+void GCScheduler::default_init()
 {
+    GCKernelLibrary::get().init("./cs_shaders/");
 }
 
-Logger &Logger::get()
+void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
 {
-    static Logger _instance;
-    return _instance;
+    GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx);
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+GCScheduler &GCScheduler::get()
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    static GCScheduler scheduler;
+    return scheduler;
 }
 
-std::ostream &Logger::log_info()
+void GCScheduler::enqueue(IGCKernel &kernel, bool flush)
 {
-    if(_verbosity == LoggerVerbosity::INFO)
+    kernel.run(kernel.window());
+    if(flush)
     {
-        return *_ostream;
+        ARM_COMPUTE_GL_CHECK(glFlush());
     }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
+}
+
+void GCScheduler::sync()
+{
+    ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));
+}
diff --git a/src/core/Logger.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
similarity index 62%
copy from src/core/Logger.cpp
copy to src/runtime/GLES_COMPUTE/GCTensor.cpp
index 9c3bf26..edbd16d 100644
--- a/src/core/Logger.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -22,35 +22,56 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+GCTensor::GCTensor()
+    : _allocator()
 {
 }
 
-Logger &Logger::get()
+ITensorAllocator *GCTensor::allocator()
 {
-    static Logger _instance;
-    return _instance;
+    return &_allocator;
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+TensorInfo *GCTensor::info() const
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    return &_allocator.info();
 }
 
-std::ostream &Logger::log_info()
+TensorInfo *GCTensor::info()
 {
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
+    return &_allocator.info();
+}
+
+uint8_t *GCTensor::buffer() const
+{
+    return _allocator.data();
+}
+
+GLuint GCTensor::gc_buffer() const
+{
+    return _allocator.get_gl_ssbo_name();
+}
+
+void GCTensor::map(bool blocking)
+{
+    IGCTensor::map(blocking);
+}
+
+void GCTensor::unmap()
+{
+    IGCTensor::unmap();
+}
+
+uint8_t *GCTensor::do_map(bool blocking)
+{
+    return _allocator.map(blocking);
+}
+
+void GCTensor::do_unmap()
+{
+    _allocator.unmap();
 }
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
new file mode 100644
index 0000000..694b34f
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCTensorAllocator::GCTensorAllocator()
+    : _gl_buffer(), _mapping(nullptr)
+{
+}
+
+uint8_t *GCTensorAllocator::data()
+{
+    return _mapping;
+}
+
+void GCTensorAllocator::allocate()
+{
+    _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    info().set_is_resizable(false);
+}
+
+void GCTensorAllocator::free()
+{
+    _gl_buffer.reset();
+    info().set_is_resizable(true);
+}
+
+uint8_t *GCTensorAllocator::lock()
+{
+    return map(true);
+}
+
+void GCTensorAllocator::unlock()
+{
+    unmap();
+}
+
+GLuint GCTensorAllocator::get_gl_ssbo_name() const
+{
+    return _gl_buffer->_ssbo_name;
+}
+
+uint8_t *GCTensorAllocator::map(bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+    ARM_COMPUTE_UNUSED(blocking);
+
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+    void *p  = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(info().total_size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
+    _mapping = reinterpret_cast<uint8_t *>(p);
+
+    return _mapping;
+}
+
+void GCTensorAllocator::unmap()
+{
+    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    _mapping = nullptr;
+}
\ No newline at end of file
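A minimal usage sketch of the SSBO-backed allocator above, assuming a GLES compute context has already been set up (for example via GCScheduler::get().default_init()) and using only calls shown in this patch: initialize the tensor info, allocate the GL buffer, map it, fill it from the CPU and unmap it.

    // Illustrative only: allocate a 16x16 F32 GCTensor, map its SSBO into CPU memory,
    // zero-fill it and unmap. Assumes a valid GLES compute context.
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"

    #include <cstring>

    using namespace arm_compute;

    void zero_fill_tensor()
    {
        GCTensor tensor;
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        tensor.allocator()->allocate(); // creates and sizes the GL shader storage buffer

        tensor.map(true);               // glMapBufferRange under the hood
        std::memset(tensor.buffer(), 0, tensor.info()->total_size());
        tensor.unmap();                 // glUnmapBuffer and drop the CPU mapping
    }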
diff --git a/src/core/Logger.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
similarity index 69%
copy from src/core/Logger.cpp
copy to src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
index 9c3bf26..199ee46 100644
--- a/src/core/Logger.cpp
+++ b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
@@ -21,36 +21,24 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+IGCSimpleFunction::IGCSimpleFunction() //NOLINT
+    : _kernel(),
+      _border_handler()
 {
 }
 
-Logger &Logger::get()
+void IGCSimpleFunction::run()
 {
-    static Logger _instance;
-    return _instance;
-}
+    ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured");
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
-{
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    GCScheduler::get().enqueue(_border_handler, false);
+    GCScheduler::get().sync();
+    GCScheduler::get().enqueue(*_kernel);
 }
-
-std::ostream &Logger::log_info()
-{
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
similarity index 72%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
index 37857b6..781b357 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCAbsoluteDifference::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCAbsoluteDifferenceKernel>();
+    k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
similarity index 72%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
index 37857b6..8686416 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,17 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCActivationLayer::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCActivationLayerKernel>();
+    k->configure(input, output, act_info);
     _kernel = std::move(k);
 }
diff --git a/src/core/Logger.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
old mode 100644
new mode 100755
similarity index 61%
copy from src/core/Logger.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
index 9c3bf26..2e546a6
--- a/src/core/Logger.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
@@ -22,35 +22,27 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+GCBatchNormalizationLayer::GCBatchNormalizationLayer()
+    : _norm_kernel()
 {
 }
 
-Logger &Logger::get()
+void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon)
 {
-    static Logger _instance;
-    return _instance;
+    _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+void GCBatchNormalizationLayer::run()
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    GCScheduler::get().enqueue(_norm_kernel, true);
 }
-
-std::ostream &Logger::log_info()
-{
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
old mode 100644
new mode 100755
similarity index 67%
copy from src/runtime/CL/functions/CLDepthConcatenate.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index 89e44ca..ee0b121
--- a/src/runtime/CL/functions/CLDepthConcatenate.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -21,27 +21,25 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLDepthConcatenate::CLDepthConcatenate() // NOLINT
-    : _inputs_vector(),
-      _concat_kernels_vector(),
+GCDepthConcatenateLayer::GCDepthConcatenateLayer() //NOLINT
+    : _concat_kernels_vector(),
       _border_handlers_vector(),
       _num_inputs(0)
 {
 }
 
-void CLDepthConcatenate::configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output) // NOLINT
+void GCDepthConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output) //NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
@@ -49,13 +47,8 @@
 
     unsigned int depth_offset = 0;
 
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<CLDepthConcatenateKernel[]>(_num_inputs);
-    _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_inputs);
-
-    TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type(), inputs_vector[0]->info()->fixed_point_position());
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel[]>(_num_inputs);
+    _border_handlers_vector = arm_compute::support::cpp14::make_unique<GCFillBorderKernel[]>(_num_inputs);
 
     for(unsigned int i = 0; i < _num_inputs; i++)
     {
@@ -66,13 +59,11 @@
     }
 }
 
-void CLDepthConcatenate::run()
+void GCDepthConcatenateLayer::run()
 {
-    cl::CommandQueue q = CLScheduler::get().queue();
-
     for(unsigned i = 0; i < _num_inputs; i++)
     {
-        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+        GCScheduler::get().enqueue(_border_handlers_vector[i], false);
+        GCScheduler::get().enqueue(_concat_kernels_vector[i], true);
     }
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
new file mode 100644
index 0000000..ae9dd51
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+{
+    int kernel_size = weights->info()->dimension(0);
+
+    if(kernel_size == 1)
+    {
+        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer1x1Kernel>();
+        k->configure(input, weights, biases, output, conv_info);
+        _kernel = std::move(k);
+    }
+    else if(kernel_size == 3)
+    {
+        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer3x3Kernel>();
+        k->configure(input, weights, biases, output, conv_info);
+        _kernel = std::move(k);
+    }
+    else if(kernel_size == 5)
+    {
+        auto k = arm_compute::support::cpp14::make_unique<GCDirectConvolutionLayer5x5Kernel>();
+        k->configure(input, weights, biases, output, conv_info);
+        _kernel = std::move(k);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("kernel size unsupported!");
+        return;
+    }
+
+    _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
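A hedged usage sketch of the dispatch above: 3x3 weights select the 3x3 kernel variant. Shapes, data type and padding are illustrative, the tensors are assumed to be initialized and allocated elsewhere, and a GLES compute context is assumed to be available.

    // Illustrative only: configure and run a 3x3 direct convolution. The tensors are
    // assumed to be already initialized/allocated with compatible shapes and data type.
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"

    using namespace arm_compute;

    void run_3x3_convolution(GCTensor &src, GCTensor &weights, GCTensor &biases, GCTensor &dst)
    {
        GCDirectConvolutionLayer conv;
        // weights dimension(0) == 3 selects GCDirectConvolutionLayer3x3Kernel above;
        // stride 1 with 1-pixel padding keeps the spatial size unchanged.
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
        conv.run();
    }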
diff --git a/src/core/Logger.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
similarity index 61%
copy from src/core/Logger.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
index 9c3bf26..032c2fd 100644
--- a/src/core/Logger.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
@@ -22,35 +22,29 @@
  * SOFTWARE.
  */
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+GCDropoutLayer::GCDropoutLayer()
+    : _dropout_kernel()
 {
 }
 
-Logger &Logger::get()
+void GCDropoutLayer::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
 {
-    static Logger _instance;
-    return _instance;
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
+
+    // Configure kernel
+    _dropout_kernel.configure(input, mask, output, ratio, forward);
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+void GCDropoutLayer::run()
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    GCScheduler::get().enqueue(_dropout_kernel);
 }
-
-std::ostream &Logger::log_info()
-{
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
-}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
similarity index 70%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
index 37857b6..5c2431f 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
+#include "arm_compute/core/Helpers.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCFillBorder::configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCFillBorderKernel>();
+    k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
new file mode 100644
index 0000000..63cb40e
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCTensor *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>();
+    k->configure(input, output);
+    _kernel = std::move(k);
+}
+
+GCFullyConnectedLayer::GCFullyConnectedLayer()
+    : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true),
+      _accumulate_biases(false)
+{
+}
+
+void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
+
+    const DataType dt = input->info()->data_type();
+
+    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+    // Initialize output tensor for im2col
+    TensorShape shape_im2col;
+    shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
+    shape_im2col.set(1, input->info()->dimension(3));
+    shape_im2col.set(2, input->info()->dimension(4));
+    shape_im2col.set(3, input->info()->dimension(5));
+    _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
+
+    // Configure im2col kernel
+    _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
+
+    // Allocate the output tensor for im2col once all the configure methods have been called
+    _im2col_output.allocator()->allocate();
+}
+
+void GCFullyConnectedLayer::configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
+
+    // Configure matrix multiply kernel
+    _mm_kernel.configure(input, weights, output, 1.0f, false);
+}
+
+void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights, bool are_weights_reshaped)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+
+    _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+    _is_fc_after_conv     = true;
+    _accumulate_biases    = false;
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+
+        _accumulate_biases = true;
+
+        // Configure accumulate biases kernel
+        _accumulate_biases_kernel.configure(output, biases);
+    }
+
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
+
+    const IGCTensor *weights_to_use = weights;
+
+    if(!_are_weights_reshaped)
+    {
+        weights_to_use = &_reshape_weights_output;
+
+        // Reshape the weights
+        _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
+    }
+
+    // Check if we have a fully connected layer with batches
+    const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
+
+    if(is_batched_fc_layer)
+    {
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                                                  input->info()->tensor_shape().cend(),
+                                                                                  output->info()->tensor_shape().cbegin() + 1));
+    }
+    else
+    {
+        _is_fc_after_conv = input->info()->num_dimensions() > 1;
+    }
+
+    if(_is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        configure_conv_fc(input, weights_to_use, output);
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        configure_fc_fc(input, weights_to_use, output);
+    }
+
+    // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
+    if(!_are_weights_reshaped)
+    {
+        // Allocate the tensor for the weights reshaped
+        _reshape_weights_output.allocator()->allocate();
+    }
+}
+
+void GCFullyConnectedLayer::run()
+{
+    // Reshape of the weights (happens only once)
+    if(!_are_weights_reshaped)
+    {
+        _are_weights_reshaped = true;
+        _reshape_weights_kernel.run();
+    }
+
+    // Linearize input if it comes from a convolutional layer
+    if(_is_fc_after_conv)
+    {
+        GCScheduler::get().enqueue(_im2col_kernel, false);
+    }
+
+    GCScheduler::get().sync();
+
+    // Run matrix multiply
+    GCScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+
+    // Accumulate biases if provided
+    if(_accumulate_biases)
+    {
+        GCScheduler::get().sync();
+
+        GCScheduler::get().enqueue(_accumulate_biases_kernel);
+    }
+}
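The shape comparison that selects between the four cases listed above can be illustrated standalone. The sketch below treats shapes as plain vectors ordered lowest dimension first, which is only an approximation of TensorShape:

#include <algorithm>
#include <vector>

// Decide whether a fully connected layer follows a convolution layer,
// mirroring the shape check in GCFullyConnectedLayer::configure().
bool fc_comes_after_conv(const std::vector<unsigned> &input_shape,
                         const std::vector<unsigned> &output_shape)
{
    const bool is_batched = output_shape.size() > 1 && output_shape[1] > 1;
    if(is_batched)
    {
        // Batched case: input dimensions from index 3 upwards must match the
        // output dimensions from index 1 upwards (the batch dimensions agree).
        if(input_shape.size() < 4 || output_shape.size() < input_shape.size() - 2)
        {
            return false;
        }
        return std::equal(input_shape.begin() + 3, input_shape.end(), output_shape.begin() + 1);
    }
    // Non-batched case: a multi-dimensional input still carries the W x H x C
    // layout produced by a convolution layer, so it needs linearising first.
    return input_shape.size() > 1;
}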
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
new file mode 100644
index 0000000..c47a0e7
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+using namespace arm_compute;
+
+GCGEMM::GCGEMM()
+    : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+{
+}
+
+void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+
+    if(c != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
+        ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix");
+        ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix");
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+
+    // If the input tensor has 16 rows or fewer, run a special version of GEMM without reshaping the input tensors
+    _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+    const IGCTensor *matrix_a = a;
+    const IGCTensor *matrix_b = b;
+
+    if(_is_interleaved_transposed)
+    {
+        matrix_a = &_tmp_a;
+        matrix_b = &_tmp_b;
+
+        TensorShape shape_tmp_a = a->info()->tensor_shape();
+        TensorShape shape_tmp_b = b->info()->tensor_shape();
+
+        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
+
+        const unsigned int transpose_w = max_gc_vector_width / data_size_from_type(b->info()->data_type());
+        shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
+        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
+        _tmp_a.allocator()->init(info_a);
+
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
+        _tmp_b.allocator()->init(info_b);
+
+        // Configure interleave kernel
+        _interleave_kernel.configure(a, &_tmp_a);
+
+        // Configure transpose kernel
+        _transpose_kernel.configure(b, &_tmp_b);
+    }
+
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+
+    if(_is_interleaved_transposed)
+    {
+        // Allocate intermediate tensors
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+
+    // Configure matrix addition kernel
+    if(beta != 0 && c != nullptr)
+    {
+        _ma_kernel.configure(c, output, beta);
+        _run_addition = true;
+    }
+}
+
+void GCGEMM::run()
+{
+    if(_is_interleaved_transposed)
+    {
+        // Run interleave kernel
+        GCScheduler::get().enqueue(_interleave_kernel, false);
+
+        // Run transpose kernel
+        GCScheduler::get().enqueue(_transpose_kernel, false);
+    }
+
+    // Run matrix multiply kernel
+    GCScheduler::get().enqueue(_mm_kernel, !_run_addition);
+
+    // Run matrix addition kernel
+    if(_run_addition)
+    {
+        GCScheduler::get().enqueue(_ma_kernel);
+    }
+}
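The temporary shapes computed in GCGEMM::configure() follow a simple rule: interleaving packs four rows of A into one, and the 1xW transpose packs transpose_w elements of B per block. A standalone sketch of that arithmetic, with transpose_w standing in for max_gc_vector_width divided by the element size:

#include <cmath>
#include <cstdio>
#include <utility>

// Shapes of the temporary tensors used when GEMM reshapes its inputs,
// following the arithmetic above. Dimensions are (width, height).
std::pair<unsigned, unsigned> interleaved_a_shape(unsigned a_w, unsigned a_h)
{
    // Four rows of A are interleaved into one, so width grows by 4 and height shrinks by 4.
    return { a_w * 4, static_cast<unsigned>(std::ceil(a_h / 4.0f)) };
}

std::pair<unsigned, unsigned> transposed_b_shape(unsigned b_w, unsigned b_h, unsigned transpose_w)
{
    // B is transposed in blocks of transpose_w elements per output row.
    return { b_h * transpose_w, static_cast<unsigned>(std::ceil(b_w / static_cast<float>(transpose_w))) };
}

int main()
{
    const auto tmp_a = interleaved_a_shape(64, 33);     // -> 256 x 9
    const auto tmp_b = transposed_b_shape(48, 64, 4);   // -> 256 x 12
    std::printf("tmp_a: %ux%u, tmp_b: %ux%u\n", tmp_a.first, tmp_a.second, tmp_b.first, tmp_b.second);
    return 0;
}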
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
similarity index 74%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
index 37857b6..44c940e 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCGEMMInterleave4x4::configure(const IGCTensor *input, IGCTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCGEMMInterleave4x4Kernel>();
+    k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
similarity index 71%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
index 37857b6..893fa55 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCGEMMTranspose1xW::configure(const IGCTensor *input, IGCTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCGEMMTranspose1xWKernel>();
+    k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
new file mode 100644
index 0000000..d30ed52
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+using namespace arm_compute;
+
+GCNormalizationLayer::GCNormalizationLayer()
+    : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
+{
+}
+
+void GCNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info)
+{
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+
+    _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
+
+    _norm_kernel.configure(input, &_squared_input, output, norm_info);
+    _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
+    // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
+    _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+    // Allocate intermediate buffers
+    _squared_input.allocator()->allocate();
+}
+
+void GCNormalizationLayer::run()
+{
+    GCScheduler::get().enqueue(_multiply_kernel, false);
+    GCScheduler::get().enqueue(_border_handler, false);
+    GCScheduler::get().enqueue(_norm_kernel, false);
+}
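The three kernels enqueued in run() implement local response normalisation: the pixel-wise multiply produces the squared input, the border handler pads it, and the normalisation kernel sums the squares over a window around each element. A rough scalar sketch of a 1D (IN_MAP-style) variant, assuming the conventional LRN formula with parameters kappa, alpha and beta:

#include <cmath>
#include <vector>

// Scalar 1D local response normalisation over a window of norm_size elements,
// an approximation of what the squared-input and norm kernel pair computes.
std::vector<float> lrn_1d(const std::vector<float> &in, int norm_size,
                          float alpha, float beta, float kappa)
{
    std::vector<float> sq(in.size()), out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i) sq[i] = in[i] * in[i]; // multiply kernel
    const int radius = norm_size / 2;
    for(int i = 0; i < static_cast<int>(in.size()); ++i)
    {
        float sum = 0.f;
        for(int j = i - radius; j <= i + radius; ++j) // out-of-range taps read the zero border
        {
            if(j >= 0 && j < static_cast<int>(in.size())) sum += sq[j];
        }
        out[i] = in[i] / std::pow(kappa + (alpha / norm_size) * sum, beta);
    }
    return out;
}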
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
old mode 100644
new mode 100755
similarity index 72%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
index 37857b6..0cd87ea
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCPixelWiseMultiplication::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCPixelWiseMultiplicationKernel>();
+    k->configure(input1, input2, output, scale);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
similarity index 61%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index 37857b6..46a60cd 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    // Configure pooling kernel
+    auto k = arm_compute::support::cpp14::make_unique<GCPoolingLayerKernel>();
+    k->configure(input, output, pool_info);
     _kernel = std::move(k);
+
+    // Configure border depending on operation required
+    BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+    _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
 }
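The border mode choice above matters for correctness: a constant zero border could win a MAX reduction against negative inputs, so the edge value is replicated instead, while average pooling keeps the usual zero padding. A trivial sketch of that selection, with local enums standing in for the arm_compute types:

enum class SketchBorderMode { CONSTANT, REPLICATE };
enum class SketchPoolingType { MAX, AVG };

// Replicate the border for MAX pooling so padded values never become the maximum.
SketchBorderMode border_for_pooling(SketchPoolingType type)
{
    return (type == SketchPoolingType::MAX) ? SketchBorderMode::REPLICATE : SketchBorderMode::CONSTANT;
}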
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
new file mode 100644
index 0000000..34464ff
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+using namespace arm_compute;
+
+GCSoftmaxLayer::GCSoftmaxLayer()
+    : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+{
+}
+
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta)
+{
+    ARM_COMPUTE_UNUSED(beta);
+
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(beta != 1.0f);
+
+    // Create intermediate tensors shapes
+    _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()));
+
+    TensorShape shape = input->info()->tensor_shape();
+    shape.set(0, 1);
+    TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
+    _max.allocator()->init(tensor_info_max_sum);
+    _sum.allocator()->init(tensor_info_max_sum);
+
+    // Configure Kernels
+    _max_kernel.configure(input, &_max);
+    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+    _norm_kernel.configure(&_tmp, &_sum, output);
+
+    // Allocate intermediate buffers
+    _tmp.allocator()->allocate();
+    _max.allocator()->allocate();
+    _sum.allocator()->allocate();
+}
+
+void GCSoftmaxLayer::run()
+{
+    GCScheduler::get().enqueue(_max_kernel, false);
+    GCScheduler::get().sync();
+    GCScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+    GCScheduler::get().sync();
+    GCScheduler::get().enqueue(_norm_kernel);
+}
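The three kernels map directly onto the numerically stable softmax recipe: subtract the row maximum, exponentiate and accumulate the sum, then divide by it. A scalar sketch of the same pipeline:

#include <algorithm>
#include <cmath>
#include <vector>

// The three softmax stages in scalar form: row maximum, exponentials of the
// shifted values plus their sum, then normalisation by that sum.
std::vector<float> softmax_row(const std::vector<float> &in)
{
    const float max_val = *std::max_element(in.begin(), in.end()); // _max_kernel
    std::vector<float> tmp(in.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < in.size(); ++i) // _shift_exp_sum_kernel
    {
        tmp[i] = std::exp(in[i] - max_val);
        sum += tmp[i];
    }
    for(auto &v : tmp) v /= sum; // _norm_kernel
    return tmp;
}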
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
similarity index 75%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
index 37857b6..c2dc122 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void GCTranspose::configure(const IGCTensor *input, IGCTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<GCTransposeKernel>();
+    k->configure(input, output);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 4292469..0254181 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -131,6 +131,9 @@
 {
     switch(get_cpu_impl())
     {
+        case 0xd0f:
+            _info.CPU = CPUTarget::A55_DOT;
+            break;
         case 0xd03:
             _info.CPU = CPUTarget::A53;
             break;
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
new file mode 100644
index 0000000..2c64475
--- /dev/null
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/ISimpleLifetimeManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <vector>
+
+using namespace arm_compute;
+
+ISimpleLifetimeManager::ISimpleLifetimeManager()
+    : _active_group(nullptr), _active_elements(), _finalized_groups()
+{
+}
+
+void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
+{
+    if(_active_group == nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON(group == nullptr);
+        _active_group = group;
+    }
+}
+
+void ISimpleLifetimeManager::start_lifetime(void *obj)
+{
+    ARM_COMPUTE_ERROR_ON(obj == nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+    {
+        return obj == e.id;
+    }) != std::end(_active_elements),
+    "Memory object is already registered!");
+
+    // Insert object in groups and mark its finalized state to false
+    _active_elements.emplace_back(obj);
+}
+
+void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+{
+    ARM_COMPUTE_ERROR_ON(obj == nullptr);
+
+    // Find object
+    auto it = std::find_if(std::begin(_active_elements), std::end(_active_elements), [&obj](const Element & e)
+    {
+        return obj == e.id;
+    });
+    ARM_COMPUTE_ERROR_ON(it == std::end(_active_elements));
+
+    // Update object fields and mark object as complete
+    it->handle = handle;
+    it->size   = size;
+    it->status = true;
+
+    // Check if all objects are finalized and, if so, reset the active group
+    if(are_all_finalized())
+    {
+        // Update finalized groups
+        _finalized_groups[_active_group].insert(std::end(_finalized_groups[_active_group]), std::begin(_active_elements), std::end(_active_elements));
+
+        // Update blobs and group mappings
+        update_blobs_and_mappings();
+
+        // Reset state
+        _active_elements.clear();
+        _active_group = nullptr;
+    }
+}
+
+bool ISimpleLifetimeManager::are_all_finalized() const
+{
+    return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const Element e)
+    {
+        return !e.status;
+    });
+}
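The bookkeeping above reduces to: register an element when its lifetime starts, record its size and mark it complete when its lifetime ends, and finalise the group once every element is complete. A cut-down standalone model (SimpleLifetimeTracker and Element here are illustrative, not the library classes):

#include <algorithm>
#include <cstddef>
#include <vector>

class SimpleLifetimeTracker
{
public:
    struct Element
    {
        void       *id{ nullptr };
        std::size_t size{ 0 };
        bool        status{ false };
    };

    // Register an object when its lifetime starts.
    void start_lifetime(void *obj) { _elements.push_back({ obj, 0, false }); }

    // Record the final size and mark the object as complete.
    void end_lifetime(void *obj, std::size_t size)
    {
        auto it = std::find_if(_elements.begin(), _elements.end(),
                               [obj](const Element &e) { return e.id == obj; });
        if(it != _elements.end())
        {
            it->size   = size;
            it->status = true;
        }
    }

    // The group can be finalised once every registered object is complete.
    bool all_finalized() const
    {
        return std::all_of(_elements.begin(), _elements.end(),
                           [](const Element &e) { return e.status; });
    }

private:
    std::vector<Element> _elements;
};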
diff --git a/src/core/Logger.cpp b/src/runtime/Memory.cpp
similarity index 63%
copy from src/core/Logger.cpp
copy to src/runtime/Memory.cpp
index 9c3bf26..35d0c82 100644
--- a/src/core/Logger.cpp
+++ b/src/runtime/Memory.cpp
@@ -21,36 +21,42 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/runtime/Memory.h"
 
-#include "arm_compute/core/Logger.h"
+#include "arm_compute/core/Error.h"
 
 using namespace arm_compute;
 
-Logger::Logger()
-    : _ostream(&std::cout), _nullstream(nullptr), _verbosity(LoggerVerbosity::NONE)
+Memory::Memory()
+    : _memory(nullptr), _memory_owned(nullptr)
 {
 }
 
-Logger &Logger::get()
+Memory::Memory(std::shared_ptr<uint8_t> memory)
+    : _memory(nullptr), _memory_owned(std::move(memory))
 {
-    static Logger _instance;
-    return _instance;
+    ARM_COMPUTE_ERROR_ON(_memory_owned.get() == nullptr);
+    _memory = _memory_owned.get();
 }
 
-void Logger::set_logger(std::ostream &ostream, LoggerVerbosity verbosity)
+Memory::Memory(uint8_t *memory)
+    : _memory(memory), _memory_owned(nullptr)
 {
-    _ostream   = &ostream;
-    _verbosity = verbosity;
+    ARM_COMPUTE_ERROR_ON(memory == nullptr);
 }
 
-std::ostream &Logger::log_info()
+uint8_t *Memory::buffer()
 {
-    if(_verbosity == LoggerVerbosity::INFO)
-    {
-        return *_ostream;
-    }
-    else
-    {
-        return _nullstream;
-    }
+    return _memory;
+}
+
+uint8_t *Memory::buffer() const
+{
+    return _memory;
+}
+
+uint8_t **Memory::handle()
+{
+    ARM_COMPUTE_ERROR_ON(_memory_owned.get() != nullptr);
+    return &_memory;
 }
\ No newline at end of file
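Memory distinguishes buffers it owns, kept alive through the shared_ptr, from externally managed ones passed in as raw pointers; handle() is only meaningful in the second case, since re-pointing an owned buffer would bypass the shared_ptr. A cut-down sketch of that ownership split (SimpleMemory is illustrative only):

#include <cassert>
#include <cstdint>
#include <memory>

// Minimal model of the owned / non-owned split: the shared_ptr keeps imported
// memory alive, while a raw pointer mirrors whichever buffer is in use.
class SimpleMemory
{
public:
    explicit SimpleMemory(std::shared_ptr<uint8_t> owned)
        : _buffer(owned.get()), _owned(std::move(owned)) {}
    explicit SimpleMemory(uint8_t *external)
        : _buffer(external), _owned(nullptr) {}

    uint8_t *buffer() const { return _buffer; }
    uint8_t **handle()
    {
        assert(_owned == nullptr && "handle() only valid for externally managed memory");
        return &_buffer;
    }

private:
    uint8_t                 *_buffer;
    std::shared_ptr<uint8_t> _owned;
};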
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 57a1738..cdf1b54 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -34,3 +34,8 @@
     k->configure(input, output, activation_info);
     _kernel = std::move(k);
 }
+
+Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return NEActivationLayerKernel::validate(input, output, act_info);
+}
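This release starts pairing configure() with a static validate() that runs the same argument checks without side effects, so callers can reject a bad configuration before allocating tensors. A generic standalone sketch of that pattern (SimpleStatus and validate_elementwise are illustrative, not the arm_compute Status API):

#include <cstdio>
#include <string>

// validate() performs the same checks as configure() but has no side effects,
// so a caller can reject a bad configuration up front.
struct SimpleStatus
{
    bool        ok{ true };
    std::string message;
};

SimpleStatus validate_elementwise(unsigned in0_len, unsigned in1_len, unsigned out_len)
{
    if(in0_len != in1_len || in0_len != out_len)
    {
        return { false, "mismatching tensor lengths" };
    }
    return {};
}

int main()
{
    const SimpleStatus status = validate_elementwise(128, 128, 64);
    if(!status.ok) std::printf("configuration rejected: %s\n", status.message.c_str());
    return 0;
}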
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 11f5aa7..b5dd4d0 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 37586af..5c0491e 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
 }
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+    return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
+}
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index ef79b02..f6be001 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -43,6 +43,12 @@
     _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon);
 }
 
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
+                                           float epsilon)
+{
+    return NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon);
+}
+
 void NEBatchNormalizationLayer::run()
 {
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NECol2Im.cpp
similarity index 68%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NECol2Im.cpp
index 37857b6..78c6bc0 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NECol2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NECol2Im::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<NECol2ImKernel>();
+    k->configure(input, output, convolved_dims);
     _kernel = std::move(k);
 }
+
+Status NECol2Im::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
+{
+    return NECol2ImKernel::validate(input, output, convolved_dims);
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index f34f497..25c639f 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -136,10 +136,7 @@
     // Get parameters from conv_info
     unsigned int stride_x = 0;
     unsigned int stride_y = 0;
-    unsigned int pad_x    = 0;
-    unsigned int pad_y    = 0;
     std::tie(stride_x, stride_y) = conv_info.stride();
-    std::tie(pad_x, pad_y)       = conv_info.pad();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -190,9 +187,17 @@
     {
         if(_are_weights_reshaped)
         {
-            const unsigned int transpose_width = 16 / input->info()->element_size();
-            mat_weights_cols                   = weights_info.num_kernels();
-            mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+            if(_is_fully_connected_convolution)
+            {
+                mat_weights_cols = weights_info.num_kernels();
+                mat_weights_rows = weights->info()->dimension(1);
+            }
+            else
+            {
+                const unsigned int transpose_width = 16 / input->info()->element_size();
+                mat_weights_cols                   = weights_info.num_kernels();
+                mat_weights_rows                   = weights->info()->dimension(0) / transpose_width + (_has_bias ? 1 : 0);
+            }
         }
         else
         {
@@ -270,7 +275,7 @@
         // Configure matrix multiplication kernel
         if(_is_fully_connected_convolution)
         {
-            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f, false, false);
+            _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
         }
         else
         {
@@ -295,7 +300,7 @@
     }
 
     _input_im2col_reshaped.allocator()->allocate();
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
     _gemm_output.allocator()->allocate();
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
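The branch added above changes how the GEMM weights matrix dimensions are derived when the weights arrive already reshaped: the fully-connected-convolution path uses the plain reshaped weights, while the interleaved path accounts for the 1xW transpose and the optional bias row. A standalone sketch of that arithmetic (the parameters are stand-ins for the tensor dimensions used above):

#include <utility>

// Column/row counts of the weights matrix fed to GEMM when the weights arrive
// already reshaped. transpose_width stands in for 16 / element_size.
std::pair<unsigned, unsigned> reshaped_weights_dims(bool fully_connected_convolution,
                                                    unsigned num_kernels,
                                                    unsigned weights_dim0,
                                                    unsigned weights_dim1,
                                                    unsigned transpose_width,
                                                    bool has_bias)
{
    const unsigned cols = num_kernels;
    const unsigned rows = fully_connected_convolution
                              ? weights_dim1
                              : weights_dim0 / transpose_width + (has_bias ? 1u : 0u);
    return { cols, rows };
}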
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
new file mode 100644
index 0000000..7b4e77b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _scale_f(),
+      _conv_f(),
+      _scaled_output()
+{
+}
+
+void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
+                                     unsigned int ax, unsigned int ay, float upscalex, float upscaley)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+                                                    info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round());
+
+    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
+
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+
+    _memory_group.manage(&_scaled_output);
+
+    // Configure the scale function
+    // Init and allocate an intermediate tensor for the scaled output: same shape as the input, but the first two axes match the output tensor
+    TensorShape scale_out_shape(input->info()->tensor_shape());
+    scale_out_shape.set(0, output->info()->dimension(0));
+    scale_out_shape.set(1, output->info()->dimension(1));
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _scaled_output.allocator()->init(scale_out_info);
+    const unsigned int kernel_size = weights->info()->dimension(0);
+    // Padding for the upsampled image is calculated with the equation: p' = k - p - 1, where k is the kernel size and p is the input padding
+    ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1));
+    const unsigned int  tr_px     = kernel_size - info.pad().first - 1;
+    const unsigned int  tr_py     = kernel_size - info.pad().second - 1;
+    const unsigned int  tr_stride = 1;
+    const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py);
+    _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info);
+    // Set up the function to convolve the upscaled output
+    switch(kernel_size)
+    {
+        case 1:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
+            break;
+        }
+        case 3:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+            break;
+        }
+        case 5:
+        {
+            _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Not supported");
+            break;
+        }
+    }
+    _scaled_output.allocator()->allocate();
+}
+
+void NEDeconvolutionLayer::run()
+{
+    _memory_group.acquire();
+    _scale_f.run();
+    _conv_f.run();
+    _memory_group.release();
+}
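The relation used for the upsampled image's padding, p' = k - p - 1, can be checked in isolation. The sketch below assumes a square kernel and pad values no larger than k - 1, as asserted above:

#include <cassert>
#include <utility>

// Padding applied to the upsampled image before the forward convolution,
// one value per spatial direction. kernel_size is assumed to be at least 1.
std::pair<unsigned, unsigned> transposed_padding(unsigned kernel_size, unsigned pad_x, unsigned pad_y)
{
    assert(pad_x <= kernel_size - 1 && pad_y <= kernel_size - 1);
    return { kernel_size - pad_x - 1, kernel_size - pad_y - 1 };
}
// Example: a 5x5 kernel with input padding (2, 2) gives a transposed padding of (2, 2).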
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
new file mode 100644
index 0000000..79b9b2d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+using namespace arm_compute;
+
+namespace
+{
+inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair<unsigned int, unsigned int> &a,
+                               const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+    Window    win;
+    const int padx          = info.pad().first;
+    const int pady          = info.pad().second;
+    const int ax            = a.first;
+    const int ay            = a.second;
+    const int offset_width  = offsets->info()->dimension(0);
+    const int offset_height = offsets->info()->dimension(1);
+    // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image.
+    // The step along the X and Y axes depends on the number of zeros to be inserted between samples (number of zeros + 1).
+    // Pre-compute the X offsets; Y's stride is unknown at this point, so Y offsets cannot be precomputed
+    for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second))
+    {
+        for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first))
+        {
+            int         *ptr                  = reinterpret_cast<int *>(offsets->ptr_to_element(Coordinates(xi, yi)));
+            const size_t in_xi                = (xi + 0.5f) * wr;
+            *reinterpret_cast<int32_t *>(ptr) = in_xi * input_element_size;
+        }
+    }
+}
+} // namespace
+
+NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+    : _memory_group(std::move(memory_manager)),
+      _offsets(),
+      _border_handler(),
+      _upsample()
+{
+}
+
+void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
+                                             const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+    }
+
+    // Get the tensor shape
+    const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
+
+    // Compute the ratio between source width/height and destination width/height
+    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+    ARM_COMPUTE_UNUSED(hr);
+    // Get the element size of the input image
+    const size_t input_element_size = input->info()->element_size();
+
+    TensorInfo tensor_info_offsets(shape, Format::S32);
+    _offsets.allocator()->init(tensor_info_offsets);
+
+    _upsample.configure(input, &_offsets, output);
+
+    // Allocate once the configure methods have been called
+    _offsets.allocator()->allocate();
+    // Pre-compute offsets for nearest interpolation
+    std::fill_n(reinterpret_cast<int32_t *>(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size);
+    precompute_offsets(&_offsets, wr, input_element_size, a, iz, info);
+
+    _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0.f));
+}
+
+void NEDeconvolutionLayerUpsample::run()
+{
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
+    _memory_group.acquire();
+    NEScheduler::get().schedule(&_upsample, Window::DimY);
+    _memory_group.release();
+}
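precompute_offsets() stores, for each destination column, the byte offset of the nearest source column, so the upsample kernel can add it directly to a row pointer. A simplified sketch of the X-offset computation that ignores the zero-insertion stride and the padding skips handled by the real loop:

#include <cstdint>
#include <vector>

// Nearest-neighbour X offsets in bytes: each destination column maps back to a
// source column via the width ratio wr, scaled by the element size so the kernel
// can add the offset directly to a row pointer.
std::vector<int32_t> nearest_x_offsets(unsigned dst_width, float wr, std::size_t element_size)
{
    std::vector<int32_t> offsets(dst_width);
    for(unsigned xi = 0; xi < dst_width; ++xi)
    {
        const auto in_xi = static_cast<std::size_t>((xi + 0.5f) * wr);
        offsets[xi]      = static_cast<int32_t>(in_xi * element_size);
    }
    return offsets;
}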
diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
similarity index 89%
rename from src/runtime/NEON/functions/NEDepthConcatenate.cpp
rename to src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
index f8ad2ab..437c941 100644
--- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp
+++ b/src/runtime/NEON/functions/NEDepthConcatenateLayer.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,7 +33,7 @@
 
 using namespace arm_compute;
 
-NEDepthConcatenate::NEDepthConcatenate() // NOLINT
+NEDepthConcatenateLayer::NEDepthConcatenateLayer() // NOLINT
     : _inputs_vector(),
       _concat_kernels_vector(),
       _border_handlers_vector(),
@@ -41,12 +41,12 @@
 {
 }
 
-void NEDepthConcatenate::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
+void NEDepthConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output) // NOLINT
 {
     ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
 
     _num_inputs             = inputs_vector.size();
-    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateKernel[]>(_num_inputs);
+    _concat_kernels_vector  = arm_compute::support::cpp14::make_unique<NEDepthConcatenateLayerKernel[]>(_num_inputs);
     _border_handlers_vector = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(_num_inputs);
 
     TensorShape output_shape = calculate_depth_concatenate_shape(inputs_vector);
@@ -64,7 +64,7 @@
     }
 }
 
-void NEDepthConcatenate::run()
+void NEDepthConcatenateLayer::run()
 {
     for(unsigned i = 0; i < _num_inputs; ++i)
     {
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
similarity index 83%
rename from src/runtime/NEON/functions/NEDepthConvert.cpp
rename to src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 37857b6..9a75404 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -21,18 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
+    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
similarity index 60%
copy from src/runtime/CL/functions/CLDepthwiseConvolution.cpp
copy to src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 22c037f..b890c6f 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolution.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -21,46 +21,54 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
 
-#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-CLDepthwiseConvolution3x3::CLDepthwiseConvolution3x3()
-    : _kernel(), _border_handler()
+NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
+    : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
 {
 }
 
-void CLDepthwiseConvolution3x3::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
 
-    _kernel.configure(input, output, weights, conv_info);
-    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(0));
+    // Call convolution kernel
+    _kernel.configure(input, weights, output, conv_info);
+    _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+    if(biases != nullptr)
+    {
+        _bias_kernel.configure(output, biases);
+        _has_bias = true;
+    }
 }
 
-void CLDepthwiseConvolution3x3::run()
+void NEDepthwiseConvolutionLayer3x3::run()
 {
-    CLScheduler::get().enqueue(_border_handler);
-    CLScheduler::get().enqueue(_kernel);
+    NEScheduler::get().schedule(&_border_handler, Window::DimX);
+    NEScheduler::get().schedule(&_kernel, Window::DimX);
+    if(_has_bias)
+    {
+        NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+    }
 }
 
-CLDepthwiseConvolution::CLDepthwiseConvolution()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(), _weights_reshaped(),
-      _v2mm_output()
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _input_reshaped(), _weights_reshaped(), _v2mm_output()
 {
 }
 
-void CLDepthwiseConvolution::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != weights->info()->dimension(2));
 
@@ -68,60 +76,51 @@
     const size_t weights_h = weights->info()->dimension(1);
     const size_t weights_z = weights->info()->dimension(2);
 
+    bool has_bias = (biases != nullptr);
+
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
     std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights_w, weights_h, conv_info);
 
     // Set up intermediate tensors
-    const size_t patch_size = weights_w * weights_h;
+    const size_t patch_size = weights_w * weights_h + ((has_bias) ? 1 : 0);
     const size_t conv_size  = conv_w * conv_h;
 
+    // Im2Col configuration
     TensorShape shape_im2col = input->info()->tensor_shape();
     shape_im2col.set(0, patch_size);
     shape_im2col.set(1, conv_size);
     shape_im2col.set(2, weights_z);
+    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    _input_reshaped.allocator()->init(info_im2col);
+    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, has_bias);
 
+    // Weights reshape configuration
     const TensorShape shape_weights_reshape(patch_size, weights_z);
-    TensorShape       shape_v2mm_out = output->info()->tensor_shape();
+    const TensorInfo  info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+    _weights_reshaped.allocator()->init(info_weights_reshape);
+    _weights_reshape_kernel.configure(weights, &_weights_reshaped, biases);
+
+    // GEMV configuration
+    TensorShape shape_v2mm_out = input->info()->tensor_shape();
     shape_v2mm_out.set(0, conv_size * weights_z);
     shape_v2mm_out.set(1, 1);
     shape_v2mm_out.set(2, 1);
-
-    const TensorInfo info_im2col(shape_im2col, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    const TensorInfo info_weights_reshape(shape_weights_reshape, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
     const TensorInfo info_v2mm_out(shape_v2mm_out, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
-    _input_reshaped.allocator()->init(info_im2col);
-    _weights_reshaped.allocator()->init(info_weights_reshape);
     _v2mm_output.allocator()->init(info_v2mm_out);
-
-    // Configure kernels
-    _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info);
-    _weights_reshape_kernel.configure(weights, &_weights_reshaped);
     _v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
     _vector_to_tensor_kernel.configure(&_v2mm_output, output, conv_w, conv_h);
 
-    BorderSize border_size = _v2mm_kernel.border_size();
-    _v2mm_input_fill_border.configure(&_input_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
-
-    border_size.bottom = 0;
-    _v2mm_weights_fill_border.configure(&_weights_reshaped, border_size, BorderMode::CONSTANT, PixelValue(0));
-
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
     _weights_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
 }
 
-void CLDepthwiseConvolution::run()
+void NEDepthwiseConvolutionLayer::run()
 {
-    CLScheduler::get().enqueue(_im2col_kernel);
-
-    CLScheduler::get().enqueue(_weights_reshape_kernel);
-
-    CLScheduler::get().enqueue(_v2mm_input_fill_border);
-    CLScheduler::get().enqueue(_v2mm_weights_fill_border);
-    CLScheduler::get().enqueue(_v2mm_kernel);
-
-    CLScheduler::get().enqueue(_vector_to_tensor_kernel);
-}
+    NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
+    NEScheduler::get().schedule(&_vector_to_tensor_kernel, Window::DimX);
+}
\ No newline at end of file
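
Both new NEON depthwise functions take an optional bias tensor (pass nullptr to skip the bias kernel entirely). A minimal F32 sketch of the 3x3 variant, with illustrative shapes and a stride-1/pad-1 configuration:

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void depthwise_3x3_example()
    {
        using namespace arm_compute;

        Tensor input, weights, biases, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32)); // one 3x3 filter per channel
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32)); // stride 1, pad 1 keeps 32x32

        NEDepthwiseConvolutionLayer3x3 dwc;
        dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));
        // ... allocate the four tensors and fill input/weights/biases, then:
        dwc.run(); // fills the border, runs the 3x3 kernel, then the bias kernel
    }
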
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
new file mode 100644
index 0000000..d70a668
--- /dev/null
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+NEDepthwiseSeparableConvolutionLayer::NEDepthwiseSeparableConvolutionLayer()
+    : _depthwise_conv(), _pointwise_conv()
+{
+}
+
+void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out,
+                                                     const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output,
+                                                     const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info)
+{
+    _depthwise_conv.configure(input, depthwise_weights, depthwise_biases, depthwise_out, depthwise_conv_info);
+    _pointwise_conv.configure(depthwise_out, pointwise_weights, pointwise_biases, output, pointwise_conv_info);
+}
+
+void NEDepthwiseSeparableConvolutionLayer::run()
+{
+    _depthwise_conv.run();
+    _pointwise_conv.run();
+}
\ No newline at end of file
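
The new separable function simply chains a depthwise stage into a pointwise (1x1) stage through an explicit intermediate tensor. A hedged sketch with illustrative shapes and pad/stride values:

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void depthwise_separable_example()
    {
        using namespace arm_compute;

        // 32x32x16 input -> 3x3 depthwise (pad 1) -> 1x1 pointwise producing 32 channels.
        Tensor input, dw_weights, dw_biases, depthwise_out, pw_weights, pw_biases, output;
        input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        dw_weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32));
        dw_biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        depthwise_out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        pw_weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U, 32U), 1, DataType::F32));
        pw_biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

        NEDepthwiseSeparableConvolutionLayer sep_conv;
        sep_conv.configure(&input, &dw_weights, &dw_biases, &depthwise_out,
                           &pw_weights, &pw_biases, &output,
                           PadStrideInfo(1, 1, 1, 1),  // depthwise_conv_info
                           PadStrideInfo(1, 1, 0, 0)); // pointwise_conv_info
        // ... allocate and fill the tensors, then:
        sep_conv.run(); // runs the depthwise stage followed by the pointwise stage
    }
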
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 52a4cc1..afa5d97 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
 using namespace arm_compute;
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+    : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
 {
 }
 
@@ -46,38 +46,28 @@
         _accumulator.allocator()->free();
     }
 
+    // Check if bias should be added in the convolution result
+    _has_bias = (bias != nullptr);
+
     // Allocate the intermediate accumulator tensor in case of fixed point input
-    switch(output->info()->data_type())
+    if(is_data_type_fixed_point(input->info()->data_type()))
     {
-        case DataType::QS8:
+        const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
+        _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
+        _memory_group.manage(&_accumulator);
+        _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+        if(_has_bias)
         {
-            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
-            _memory_group.manage(&_accumulator);
-            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
             _accumulate_bias_kernel.configure(&_accumulator, bias, output);
-            _accumulator.allocator()->allocate();
-            break;
         }
-        case DataType::QS16:
+        _accumulator.allocator()->allocate();
+    }
+    else
+    {
+        _conv_kernel.configure(input, weights, output, conv_info);
+        if(_has_bias)
         {
-            _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
-            _memory_group.manage(&_accumulator);
-            _conv_kernel.configure(input, weights, &_accumulator, conv_info);
-            _accumulate_bias_kernel.configure(&_accumulator, bias, output);
-            _accumulator.allocator()->allocate();
-            break;
-        }
-        case DataType::F16:
-        case DataType::F32:
-        {
-            _conv_kernel.configure(input, weights, output, conv_info);
             _accumulate_bias_kernel.configure(output, bias);
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Data type not supported");
-            break;
         }
     }
 
@@ -85,6 +75,38 @@
     _input_border_handler.configure(input, _conv_kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
 }
 
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+
+    DataType data_type = output->data_type();
+    if(is_data_type_fixed_point(data_type))
+    {
+        // Promote data type in case of fixed point
+        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
+    }
+    TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
+
+    // Validate Convolution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
+
+    // Validate bias
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
+                                    "Biases should be provided for fixed point inputs");
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of input feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+
+        // Validate bias kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
+    }
+
+    return Status{};
+}
+
 void NEDirectConvolutionLayer::run()
 {
     NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
@@ -92,7 +114,10 @@
     _memory_group.acquire();
 
     NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
-    NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+    if(_has_bias)
+    {
+        NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+    }
 
     _memory_group.release();
 }
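
The new static validate() lets callers check a configuration against ITensorInfo descriptors before creating any tensors. A sketch under assumed F32 shapes; configure() itself takes the matching ITensor pointers:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"

    void direct_convolution_validate_example()
    {
        using namespace arm_compute;

        // 32x32x16 input, eight 3x3x16 kernels, no padding, stride 1 -> 30x30x8 output.
        const TensorInfo input(TensorShape(32U, 32U, 16U), 1, DataType::F32);
        const TensorInfo weights(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32);
        const TensorInfo bias(TensorShape(8U), 1, DataType::F32);
        const TensorInfo output(TensorShape(30U, 30U, 8U), 1, DataType::F32);

        const Status status = NEDirectConvolutionLayer::validate(&input, &weights, &bias, &output,
                                                                  PadStrideInfo(1, 1, 0, 0));
        ARM_COMPUTE_ERROR_THROW_ON(status); // on success, configure() can be called with matching ITensors
    }
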
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 2e8d105..fc04e28 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -133,7 +133,7 @@
     const int      num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
     const size_t   linear_input_size    = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
 
-    _linearize_input      = input->info()->tensor_shape().x() != linear_input_size;
+    _linearize_input      = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
     _are_weights_reshaped = are_weights_reshaped;
     _accumulate_biases    = biases != nullptr;
     _is_batched_fc_layer  = num_batch_dimensions > 0;
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index ff92ef8..950f4c9 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h"
 #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -39,6 +40,7 @@
 {
 #include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a32_sgemm_8x6.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp"
 #include "arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp"
 } // namespace arm_compute
 
@@ -96,6 +98,14 @@
         {
             _mm_optimised_kernel = support::cpp14::make_unique<NEGEMMAArch64Kernel>();
         }
+        else if(a->info()->data_type() == DataType::F16 && (c == nullptr || beta == 0.f))
+        {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            _mm_optimised_kernel = support::cpp14::make_unique<NEHGEMMAArch64FP16Kernel>();
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        }
 #endif /* defined(__arm__) || defined(__aarch64__) */
 
 #if defined(__arm__) || defined(__aarch64__)
@@ -107,19 +117,32 @@
             const int N = d->info()->tensor_shape().x();
             const int K = a->info()->tensor_shape().x();
 
+            size_t workbench_size = 0;
+
 #if defined(__arm__)
-            GemmInterleaved<sgemm_8x6, float, float> gemm(&ci, M, N, K, false, false);
+            workbench_size = GemmInterleaved<sgemm_8x6, sgemm_8x6::operand_type, sgemm_8x6::result_type>(&ci, M, N, K, false, false).get_working_size();
 #elif defined(__aarch64__)
-            GemmInterleaved<sgemm_12x8, float, float> gemm(&ci, M, N, K, false, false);
+            if(a->info()->data_type() == DataType::F32)
+            {
+                workbench_size = GemmInterleaved<sgemm_12x8, sgemm_12x8::operand_type, sgemm_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+            }
+            else if(a->info()->data_type() == DataType::F16)
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                workbench_size = GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                ARM_COMPUTE_ERROR("Recompile the library with arch=arm64-v8.2-a to enable support for FP16.");
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            }
 #endif /* defined(__arm__) || defined(__aarch64__) */
 
             constexpr size_t alignment = 4096;
-            _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            ARM_COMPUTE_ERROR_ON_MSG(workbench_size == 0, "size cannot be 0");
+            _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::S8));
             _memory_group.manage(&_workspace);
 
             // Configure matrix multiplication kernel
             _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
-
             _workspace.allocator()->allocate();
         }
         else
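
The FP16 path is only taken when the library is built with arch=arm64-v8.2-a (otherwise configuration raises the error shown above) and when no accumulation matrix is used (c == nullptr or beta == 0). A hedged F16 sketch with illustrative sizes:

    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
    #include "arm_compute/runtime/Tensor.h"

    void fp16_gemm_example()
    {
        using namespace arm_compute;

        // D = alpha * A * B with M=128, N=32, K=64 (widths come first in TensorShape).
        Tensor a, b, d;
        a.allocator()->init(TensorInfo(TensorShape(64U, 128U), 1, DataType::F16));
        b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::F16));
        d.allocator()->init(TensorInfo(TensorShape(32U, 128U), 1, DataType::F16));

        NEGEMM gemm;
        gemm.configure(&a, &b, nullptr /* c */, &d, 1.f /* alpha */, 0.f /* beta */);
        // ... allocate and fill a/b, then:
        gemm.run();
    }
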
diff --git a/src/runtime/NEON/functions/NEGEMMLowp.cpp b/src/runtime/NEON/functions/NEGEMMLowp.cpp
deleted file mode 100644
index 7413b28..0000000
--- a/src/runtime/NEON/functions/NEGEMMLowp.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-
-using namespace arm_compute;
-
-NEGEMMLowp::NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void NEGEMMLowp::configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
-
-    /* The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] */
-    TensorShape shape_tmp_a = a->info()->tensor_shape();
-    shape_tmp_a.set(0, a->info()->dimension(0) * 4);
-    shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
-    TensorShape shape_tmp_b = b->info()->tensor_shape();
-    shape_tmp_b.set(0, b->info()->dimension(1) * 16);
-    shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
-    TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
-    TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
-    _tmp_a.allocator()->init(info_a);
-    _tmp_b.allocator()->init(info_b);
-
-    // Manage intermediate buffers
-    _memory_group.manage(&_tmp_a);
-    _memory_group.manage(&_tmp_b);
-
-    _interleave_kernel.configure(a, &_tmp_a);
-    _transpose_kernel.configure(b, &_tmp_b);
-    _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
-    _tmp_a.allocator()->allocate();
-    _tmp_b.allocator()->allocate();
-}
-
-void NEGEMMLowp::run()
-{
-    _memory_group.acquire();
-
-    /* Run interleave kernel */
-    NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
-
-    /* Run transpose kernel */
-    NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
-    /* Run matrix multiply kernel */
-    NEScheduler::get().schedule(&_mm_kernel, Window::DimY);
-
-    _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..6e03ffa
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -0,0 +1,216 @@
+/* Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b(), _workspace()
+{
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+#ifdef __aarch64__
+    const int            M                   = output->info()->tensor_shape().y();
+    const int            N                   = output->info()->tensor_shape().x();
+    const int            K                   = a->info()->tensor_shape().x();
+    constexpr size_t     workspace_alignment = 4096;
+    const struct CPUInfo ci                  = NEScheduler::get().cpu_info();
+#endif /* __aarch64__ */
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    if(ci.CPU == CPUTarget::A75_DOT)
+    {
+        // Configure matrix multiply kernel
+        GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else if(ci.CPU == CPUTarget::A55_DOT)
+    {
+        ARM_COMPUTE_ERROR_ON("WIP");
+    }
+    else
+#elif defined(ARM_COMPUTE_AARCH64_V8A)
+    if(ci.CPU == CPUTarget::A53)
+    {
+        switch(a->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            case DataType::U8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Datatype not supported");
+        }
+
+        _memory_group.manage(&_workspace);
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64A53Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else if(1) // Generic v8a kernel
+    {
+        switch(a->info()->data_type())
+        {
+            case DataType::S8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            case DataType::U8:
+            {
+                // Configure matrix multiply kernel
+                GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&ci, M, N, K, false, false);
+                _workspace.allocator()->init(TensorInfo(TensorShape{ (gemm.get_working_size() + workspace_alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Datatype not supported");
+        }
+        _memory_group.manage(&_workspace);
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+        _workspace.allocator()->allocate();
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->info()->tensor_shape();
+        shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->info()->tensor_shape();
+        shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+        _tmp_a.allocator()->init(info_a);
+        _tmp_b.allocator()->init(info_b);
+        _memory_group.manage(&_tmp_a);
+        _memory_group.manage(&_tmp_b);
+
+        // Configure interleave kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+            k->configure(a, &_tmp_a);
+            _mtx_a_reshape_kernel = std::move(k);
+        }
+
+        // Configure transpose kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+            k->configure(b, &_tmp_b);
+            _mtx_b_reshape_kernel = std::move(k);
+        }
+
+        // Configure matrix multiply kernel
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+            k->configure(&_tmp_a, &_tmp_b, output);
+            _mm_kernel = std::move(k);
+        }
+
+        // Allocate tensors
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+}
+
+void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
+{
+    _memory_group.acquire();
+    if(_mtx_a_reshape_kernel)
+    {
+        NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+    }
+
+    if(_mtx_b_reshape_kernel)
+    {
+        NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+    }
+
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+    _memory_group.release();
+}
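
A usage sketch for the new assembly core: 8-bit inputs, raw 32-bit accumulators out, with the kernel choice made internally from the detected CPU. Sizes are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    void gemmlowp_assembly_example()
    {
        using namespace arm_compute;

        // M=64, N=32, K=128: A is K x M, B is N x K, the output is N x M (widths come first).
        Tensor a, b, output;
        a.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::S8));
        b.allocator()->init(TensorInfo(TensorShape(32U, 128U), 1, DataType::S8));
        output.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::S32));

        NEGEMMLowpAssemblyMatrixMultiplyCore gemmlowp_asm;
        gemmlowp_asm.configure(&a, &b, &output);
        // ... allocate and fill a/b, then:
        gemmlowp_asm.run();
    }
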
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000..50aa5b6
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
+} // namespace arm_compute
+
+using namespace arm_compute;
+
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
+      _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _dot_product_path(false)
+{
+}
+
+void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+
+    _a_offset                         = a->info()->quantization_info().offset;
+    _b_offset                         = b->info()->quantization_info().offset;
+    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        _dot_product_path = true;
+
+        // Configure matrix multiply kernel
+        struct CPUInfo ci = NEScheduler::get().cpu_info();
+        const int      M  = output->info()->tensor_shape().y();
+        const int      N  = output->info()->tensor_shape().x();
+        const int      K  = a->info()->tensor_shape().x();
+
+        const size_t     workbench_size = GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type>(&ci, M, N, K, false, false).get_working_size();
+        constexpr size_t alignment      = 4096;
+        _workspace.allocator()->init(TensorInfo(TensorShape{ (workbench_size + alignment - 1) * NEScheduler::get().num_threads() }, 1, DataType::U8));
+        _memory_group.manage(&_workspace);
+
+        // Configure matrix multiplication kernel
+        auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
+        k->configure(a, b, output, &_workspace, 1.f, 1.f);
+        _mm_kernel = std::move(k);
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        if(_run_vector_matrix_multiplication)
+        {
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(a, b, output);
+                _mm_kernel = std::move(k);
+            }
+        }
+        else
+        {
+            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+            TensorShape shape_tmp_a = a->info()->tensor_shape();
+            shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->info()->tensor_shape();
+            shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+            _tmp_a.allocator()->init(info_a);
+            _tmp_b.allocator()->init(info_b);
+            _memory_group.manage(&_tmp_a);
+            _memory_group.manage(&_tmp_b);
+
+            // Configure interleave kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
+                k->configure(a, &_tmp_a);
+                _mtx_a_reshape_kernel = std::move(k);
+            }
+
+            // Configure transpose kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
+                k->configure(b, &_tmp_b);
+                _mtx_b_reshape_kernel = std::move(k);
+            }
+
+            // Configure matrix multiply kernel
+            {
+                auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
+                k->configure(&_tmp_a, &_tmp_b, output);
+                _mm_kernel = std::move(k);
+            }
+        }
+    }
+
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+        if(b->info()->num_dimensions() > 1)
+        {
+            shape_vector_sum_col.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+        _vector_sum_col.allocator()->init(info_vector_sum_col);
+        _memory_group.manage(&_vector_sum_col);
+
+        // Configure Matrix B reduction kernel
+        _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+        if(a->info()->num_dimensions() > 1)
+        {
+            shape_vector_sum_row.remove_dimension(1);
+        }
+        TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+        _vector_sum_row.allocator()->init(info_vector_sum_row);
+        _memory_group.manage(&_vector_sum_row);
+
+        // Configure matrix A reduction kernel
+        _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+    }
+
+    // Configure offset contribution kernel
+    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+    // Allocate tensors
+    if(!_dot_product_path && !_run_vector_matrix_multiplication)
+    {
+        _tmp_a.allocator()->allocate();
+        _tmp_b.allocator()->allocate();
+    }
+    else
+    {
+        _workspace.allocator()->allocate();
+    }
+
+    if(_a_offset != 0)
+    {
+        _vector_sum_col.allocator()->allocate();
+    }
+
+    if(_b_offset != 0)
+    {
+        _vector_sum_row.allocator()->allocate();
+    }
+}
+
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+                                    "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+                                    "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_UNUSED(gemm_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    int32_t a_offset                         = a->quantization_info().offset;
+    int32_t b_offset                         = b->quantization_info().offset;
+    bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        // Validate matrix multiply kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        if(!run_vector_matrix_multiplication)
+        {
+            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+            TensorShape shape_tmp_a = a->tensor_shape();
+            shape_tmp_a.set(0, a->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->tensor_shape();
+            shape_tmp_b.set(0, b->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        }
+    }
+
+    TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if(a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->tensor_shape();
+        shape_vector_sum_col.remove_dimension(1);
+        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
+        shape_vector_sum_row.remove_dimension(1);
+        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    // Validate offset contribution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                             a_offset, b_offset));
+
+    return Status{};
+}
+
+void NEGEMMLowpMatrixMultiplyCore::run()
+{
+    _memory_group.acquire();
+
+    // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
+    if(!_run_vector_matrix_multiplication && !_dot_product_path)
+    {
+        if(_mtx_a_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+        }
+
+        if(_mtx_b_reshape_kernel)
+        {
+            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+        }
+    }
+
+    NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
+
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if(_b_offset != 0)
+    {
+        NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if(_a_offset != 0)
+    {
+        NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+    }
+
+    // Run offset contribution kernel
+    NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
+
+    _memory_group.release();
+}
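
An end-to-end sketch for the quantized core: QASYMM8 inputs whose offsets feed the reduction and offset-contribution kernels, and an S32 output. The QuantizationInfo-carrying TensorInfo constructor and the default-constructed GEMMInfo are assumptions about the accompanying headers; sizes, scales and offsets are illustrative:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    void gemmlowp_core_example()
    {
        using namespace arm_compute;

        // M=64, N=32, K=128.
        Tensor a, b, output;
        a.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 3)));
        b.allocator()->init(TensorInfo(TensorShape(32U, 128U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
        output.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::S32));

        ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a.info(), b.info(), output.info(), GEMMInfo()));

        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, &output, GEMMInfo());
        // ... allocate and fill a/b, then:
        gemmlowp.run();
    }
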
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000..8c02436
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+    k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
+}
+
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
+                                                                    int result_offset_after_shift, int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
+    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    _kernel = std::move(k);
+}
+
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+}
\ No newline at end of file
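
The two output-stage functions requantize the S32 accumulators produced by the cores above down to 8 bits. A sketch of the fixed-point variant; the multiplier/shift/offset values are placeholders (in practice they are derived from the input and output quantization scales), and QASYMM8 as the destination type is an assumption:

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
    #include "arm_compute/runtime/Tensor.h"

    void gemmlowp_output_stage_example()
    {
        using namespace arm_compute;

        Tensor acc, dst; // acc would typically be the S32 output of NEGEMMLowpMatrixMultiplyCore
        acc.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::S32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::QASYMM8));

        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
        output_stage.configure(&acc, nullptr /* no bias */, &dst,
                               1073741824 /* result_fixedpoint_multiplier */,
                               8          /* result_shift */,
                               10         /* result_offset_after_shift */,
                               0, 255     /* clamp to the full uint8 range */);
        // ... allocate the tensors and fill acc, then:
        output_stage.run();
    }
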
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index 84ea0ca..8a85bba 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -47,7 +47,8 @@
 }
 
 NEGaussianPyramidHalf::NEGaussianPyramidHalf() // NOLINT
-    : _border_handler(),
+    : _horizontal_border_handler(),
+      _vertical_border_handler(),
       _horizontal_reduction(),
       _vertical_reduction()
 {
@@ -62,6 +63,9 @@
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != pyramid->info()->height());
     ARM_COMPUTE_ERROR_ON(SCALE_PYRAMID_HALF != pyramid->info()->scale());
 
+    // Constant value to use for vertical fill border when the border mode is CONSTANT
+    const uint16_t pixel_value_u16 = static_cast<uint16_t>(constant_border_value) * 2 + static_cast<uint16_t>(constant_border_value) * 8 + static_cast<uint16_t>(constant_border_value) * 6;
+
     /* Get number of pyramid levels */
     const size_t num_levels = pyramid->info()->num_levels();
 
@@ -70,9 +74,10 @@
 
     if(num_levels > 1)
     {
-        _border_handler       = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
-        _horizontal_reduction = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
-        _vertical_reduction   = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
+        _horizontal_border_handler = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+        _vertical_border_handler   = arm_compute::support::cpp14::make_unique<NEFillBorderKernel[]>(num_levels - 1);
+        _horizontal_reduction      = arm_compute::support::cpp14::make_unique<NEGaussianPyramidHorKernel[]>(num_levels - 1);
+        _vertical_reduction        = arm_compute::support::cpp14::make_unique<NEGaussianPyramidVertKernel[]>(num_levels - 1);
 
         // Apply half scale to the X dimension of the tensor shape
         TensorShape tensor_shape = pyramid->info()->tensor_shape();
@@ -84,13 +89,16 @@
         for(unsigned int i = 0; i < num_levels - 1; ++i)
         {
             /* Configure horizontal kernel */
-            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode == BorderMode::UNDEFINED);
+            _horizontal_reduction[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i));
 
             /* Configure vertical kernel */
-            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), border_mode == BorderMode::UNDEFINED);
+            _vertical_reduction[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1));
 
             /* Configure border */
-            _border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+            _horizontal_border_handler[i].configure(_pyramid->get_pyramid_level(i), _horizontal_reduction[i].border_size(), border_mode, PixelValue(constant_border_value));
+
+            /* Configure border */
+            _vertical_border_handler[i].configure(_tmp.get_pyramid_level(i), _vertical_reduction[i].border_size(), border_mode, PixelValue(pixel_value_u16));
         }
 
         _tmp.allocate();
@@ -109,8 +117,9 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
+        NEScheduler::get().schedule(_horizontal_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
+        NEScheduler::get().schedule(_vertical_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
     }
 }
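
A note on the new vertical border constant above: the vertical reduction consumes the output of the horizontal pass, so a CONSTANT border value c has to be pre-scaled by the horizontal kernel before it is valid padding for the intermediate tensor. The expression groups the symmetric taps of the 1-4-6-4-1 horizontal Gaussian kernel, pixel_value_u16 = (1+1)*c + (4+4)*c + 6*c = 2*c + 8*c + 6*c = 16*c, i.e. the constant multiplied by the kernel's coefficient sum of 16, which appears to be the intent of filling the _tmp border with the already-filtered value.
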
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
similarity index 64%
copy from src/runtime/NEON/functions/NEDepthConvert.cpp
copy to src/runtime/NEON/functions/NEIm2Col.cpp
index 37857b6..8e90e66 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2017 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,18 +21,21 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
 
-#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
+#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "support/ToolchainSupport.h"
 
-#include <utility>
-
 using namespace arm_compute;
 
-void NEDepthConvert::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
-    k->configure(input, output, policy, shift);
+    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
+    k->configure(input, output, kernel_dims, conv_info, has_bias);
     _kernel = std::move(k);
 }
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias);
+}
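
NEIm2Col is a thin wrapper: configure() forwards to NEIm2ColKernel and the new validate() forwards to the kernel's static check. The idea behind im2col itself is that every receptive field of the input becomes one column of a matrix, so the convolution reduces to a single GEMM. A standalone, single-channel sketch of that rearrangement (not the library kernel's exact layout; stride 1 and no padding assumed):

    #include <cstddef>
    #include <vector>

    // Standalone im2col sketch: each k_h x k_w window of an H x W single-channel
    // image becomes one column of a (k_h * k_w) x (out_h * out_w) matrix.
    std::vector<float> im2col_1ch(const std::vector<float> &img, std::size_t H, std::size_t W,
                                  std::size_t k_h, std::size_t k_w)
    {
        const std::size_t out_h = H - k_h + 1; // stride 1, no padding
        const std::size_t out_w = W - k_w + 1;
        std::vector<float> cols(k_h * k_w * out_h * out_w);
        std::size_t col = 0;
        for(std::size_t y = 0; y < out_h; ++y)
        {
            for(std::size_t x = 0; x < out_w; ++x, ++col)
            {
                std::size_t row = 0;
                for(std::size_t ky = 0; ky < k_h; ++ky)
                {
                    for(std::size_t kx = 0; kx < k_w; ++kx, ++row)
                    {
                        cols[row * (out_h * out_w) + col] = img[(y + ky) * W + (x + kx)];
                    }
                }
            }
        }
        return cols;
    }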
diff --git a/src/runtime/NEON/functions/NEL2Normalize.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
similarity index 86%
rename from src/runtime/NEON/functions/NEL2Normalize.cpp
rename to src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 349a781..fa62483 100644
--- a/src/runtime/NEON/functions/NEL2Normalize.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -21,19 +21,19 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h"
+#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 
-NEL2Normalize::NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager)
+NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _reduce_func(), _normalize_kernel(), _sumsq()
 {
 }
 
-void NEL2Normalize::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
+void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, unsigned int axis, float epsilon)
 {
     // Manage intermediate buffers
     _memory_group.manage(&_sumsq);
@@ -46,7 +46,7 @@
     _sumsq.allocator()->allocate();
 }
 
-void NEL2Normalize::run()
+void NEL2NormalizeLayer::run()
 {
     _memory_group.acquire();
 
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index a680f1f..0e149d4 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -28,7 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
+#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
 #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
 #include "arm_compute/runtime/Tensor.h"
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index cb48598..b29b796 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -111,7 +111,7 @@
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _weights_reshaped.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index 7877995..f865054 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -31,18 +31,36 @@
 
 using namespace arm_compute;
 
-void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16)
+void NEMagnitude::configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type, bool use_fp16)
 {
     if(use_fp16)
     {
-        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-        k->configure(input1, input2, output, nullptr);
-        _kernel = std::move(k);
+        if(mag_type == MagnitudeType::L1NORM)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
+        else
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseFP16Kernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
     }
     else
     {
-        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-        k->configure(input1, input2, output, nullptr);
-        _kernel = std::move(k);
+        if(mag_type == MagnitudeType::L1NORM)
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
+        else
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+            k->configure(input1, input2, output, nullptr);
+            _kernel = std::move(k);
+        }
     }
 }
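
With the new mag_type argument the caller selects the norm explicitly and the matching kernel template is instantiated. A hedged call sketch; gx, gy and mag are illustrative names for already-allocated S16 gradient tensors, the usual setup for magnitude:

    #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"

    // Sketch: gx/gy are assumed to be allocated S16 gradient tensors of the same
    // shape, and mag a matching S16 output.
    void compute_l1_magnitude(arm_compute::ITensor *gx, arm_compute::ITensor *gy, arm_compute::ITensor *mag)
    {
        arm_compute::NEMagnitude magnitude{};
        magnitude.configure(gx, gy, mag, arm_compute::MagnitudeType::L1NORM, false /* use_fp16 */);
        magnitude.run();
    }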
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index e01ef66..af98ac1 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -37,9 +37,9 @@
 {
 }
 
-void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
 {
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
     _input_squared.allocator()->init(tensor_info);
@@ -56,6 +56,17 @@
     _input_squared.allocator()->allocate();
 }
 
+Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+{
+    // Perform validation step
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    return Status{};
+}
+
 void NENormalizationLayer::run()
 {
     _memory_group.acquire();
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index 436d22f..6392281 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -30,9 +30,18 @@
 
 using namespace arm_compute;
 
-void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEPhase::configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
-    k->configure(input1, input2, nullptr, output);
-    _kernel = std::move(k);
+    if(phase_type == PhaseType::UNSIGNED)
+    {
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>>();
+        k->configure(input1, input2, nullptr, output);
+        _kernel = std::move(k);
+    }
+    else
+    {
+        auto k = arm_compute::support::cpp14::make_unique<NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>>();
+        k->configure(input1, input2, nullptr, output);
+        _kernel = std::move(k);
+    }
 }
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 2e2ea11..5a474e4 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -36,3 +36,7 @@
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
 }
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+    return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
+}
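
Several functions in this patch gain a static validate() that forwards straight to the corresponding kernel, so a configuration can be rejected from tensor metadata alone, before any buffer is allocated. A minimal sketch of that pattern using the entry point added above (shapes and the helper name are illustrative):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"

    // Sketch: check a pixel-wise multiplication from TensorInfo only (no buffers).
    bool pwm_config_is_valid()
    {
        using namespace arm_compute;
        const TensorInfo a(TensorShape(64U, 64U), 1, DataType::F32);
        const TensorInfo b(TensorShape(64U, 64U), 1, DataType::F32);
        const TensorInfo out(TensorShape(64U, 64U), 1, DataType::F32);

        const Status status = NEPixelWiseMultiplication::validate(&a, &b, &out, 1.f,
                                                                   ConvertPolicy::SATURATE,
                                                                   RoundingPolicy::TO_ZERO);
        return status.error_code() == ErrorCode::OK;
    }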
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index f8a85b9..530c7fc 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -48,6 +48,11 @@
     _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
 }
 
+Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    return NEPoolingLayerKernel::validate(input, output, pool_info);
+}
+
 void NEPoolingLayer::run()
 {
     // Fill border
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index bbd3fac..bd565c9 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -42,9 +42,11 @@
 
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
+    ARM_COMPUTE_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_UNUSED(sampling_policy);
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -95,7 +97,7 @@
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == input);
     ARM_COMPUTE_ERROR_ON(nullptr == output);
@@ -131,13 +133,13 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_undefined, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, sampling_policy);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -149,7 +151,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_undefined, sampling_policy);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -157,7 +159,7 @@
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, sampling_policy);
             break;
         }
         case InterpolationPolicy::AREA:
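
NEScale now threads a SamplingPolicy through configure() down to the kernel and the offset precomputation; only SamplingPolicy::CENTER is accepted at this point, per the new assert in precompute_dx_dy_offsets(). A hedged call sketch; src and dst are illustrative names for already-allocated U8 tensors whose shapes define the scaling ratio:

    #include "arm_compute/runtime/NEON/functions/NEScale.h"

    // Sketch: bilinear rescale with the (only supported) CENTER sampling policy.
    void scale_bilinear_center(arm_compute::ITensor *src, arm_compute::ITensor *dst)
    {
        arm_compute::NEScale scale{};
        scale.configure(src, dst,
                        arm_compute::InterpolationPolicy::BILINEAR,
                        arm_compute::BorderMode::REPLICATE,
                        arm_compute::PixelValue(),
                        arm_compute::SamplingPolicy::CENTER);
        scale.run();
    }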
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index cc5d4e9..8e6773c 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,9 @@
 {
 }
 
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Create intermediate tensors shapes
     TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position());
@@ -57,7 +57,7 @@
 
     // Configure Kernels
     _max_kernel.configure(input, &_max);
-    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
+    _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
     _norm_kernel.configure(&_tmp, &_sum, output);
     _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE);
 
@@ -67,6 +67,23 @@
     _sum.allocator()->allocate();
 }
 
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+    // Perform validation step
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+
+    TensorShape max_sum_shape = input->tensor_shape();
+    max_sum_shape.set(0, 1);
+
+    TensorInfo tensor_info_max_sum(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(max_sum_shape));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DMaxKernel::validate(input, &tensor_info_max_sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DShiftExpSumKernel::validate(input, &tensor_info_max_sum, input, &tensor_info_max_sum, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(NELogits1DNormKernel::validate(input, &tensor_info_max_sum, output));
+
+    return Status{};
+}
+
 void NESoftmaxLayer::run()
 {
     _memory_group.acquire();
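
The new beta argument scales the logits before exponentiation; the kernel chain (max, shift-exp-sum with beta, norm) corresponds to the numerically stable softmax exp(beta * (x_i - max)) / sum_j exp(beta * (x_j - max)). A scalar reference sketch of that computation:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scalar reference of softmax with beta: subtract the max for stability,
    // scale by beta, exponentiate, then normalise by the sum. Assumes x is non-empty.
    std::vector<float> softmax_ref(const std::vector<float> &x, float beta)
    {
        const float max_val = *std::max_element(x.begin(), x.end());
        std::vector<float> y(x.size());
        float sum = 0.f;
        for(std::size_t i = 0; i < x.size(); ++i)
        {
            y[i] = std::exp(beta * (x[i] - max_val));
            sum += y[i];
        }
        for(float &v : y)
        {
            v /= sum;
        }
        return y;
    }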
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index eb81e02..b5b28e8 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -36,3 +36,8 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status NETranspose::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NETransposeKernel::validate(input, output);
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
new file mode 100644
index 0000000..3251de4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace
+{
+inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
+{
+    const int in_width    = input->info()->dimension(0);
+    const int in_height   = input->info()->dimension(1);
+    const int in_batches  = input->info()->dimension(3);
+    const int in_channels = input->info()->dimension(2);
+    return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
+}
+} /* namespace */
+
+namespace arm_compute
+{
+NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+{
+}
+
+void NEWinogradLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(1) != 3 || weights->info()->dimension(0) != 3, "Only 3x3 kernels are supported");
+    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+    }
+
+    _weights = weights;
+    _input   = input;
+    _output  = output;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv_info.stride();
+    ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
+
+    // Get convolved dimensions
+    auto      padding     = PADDING_VALID;
+    const int in_channels = input->info()->dimension(2);
+
+    const int out_channels   = output->info()->dimension(2);
+    const int weights_width  = weights->info()->dimension(0);
+    const int weights_height = weights->info()->dimension(1);
+
+    const KernelShape   kernel_shape({ out_channels, weights_height, weights_width, in_channels });
+    const Tensor4DShape in_shape(internal_get_input_shape(input));
+
+    // Get the memory required to instantiate a new Winograd operator.
+    constexpr size_t kstore_alignment          = 64;
+    const size_t     kernel_storage_per_thread = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
+    _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_per_thread + kstore_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_kernel_storage);
+
+    // Get working space size and allocate memory
+    constexpr size_t wspace_alignment = 64;
+    const size_t     ws_size          = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_workspace);
+
+    // Workspace for weights transform
+    const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
+    _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
+    _memory_group.manage(&_weights_workspace);
+
+    _kernel_storage.allocator()->allocate();
+    _workspace.allocator()->allocate();
+    _weights_workspace.allocator()->allocate();
+
+    // Create Winograd operator object
+    _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+
+    // Configure the kernel; padding is not needed, so it is safe to call configure after allocating
+    _winograd_kernel.configure(output, _conv.get());
+}
+
+void NEWinogradLayer::run()
+{
+#if defined(__aarch64__)
+    _memory_group.acquire();
+    if(!_reshaped_kernel)
+    {
+        _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
+        _reshaped_kernel = true;
+    }
+    const Tensor4DShape in_shape(internal_get_input_shape(_input));
+    auto                padding = PADDING_VALID;
+
+    // Convert NCHW to NHWC, the layout the Winograd code expects
+    _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
+
+    // Get pointers into the workspace
+    std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
+
+    // Set up matrix pointers and transform the input tensor to the appropriate form before running GEMM.
+    _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
+
+    // Run the 16 GEMMs across multiple threads; each kernel invocation runs one or more of them
+    NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
+
+    // Transform the output to the appropriate form
+    _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
+
+    // Transform back to NCHW
+    _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
+
+    _memory_group.release();
+#else  /* __aarch64__ */
+    ARM_COMPUTE_UNUSED(_winograd_kernel);
+    ARM_COMPUTE_UNUSED(_workspace);
+    ARM_COMPUTE_UNUSED(_kernel_storage);
+    ARM_COMPUTE_UNUSED(_input);
+    ARM_COMPUTE_UNUSED(_weights);
+    ARM_COMPUTE_UNUSED(_output);
+    ARM_COMPUTE_UNUSED(_reshaped_kernel);
+    ARM_COMPUTE_UNUSED(_conv);
+    ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
+#endif /* __aarch64__ */
+}
+} // namespace arm_compute
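
The "16 GEMMs" scheduled in run() match the tile count of the F(2x2, 3x3) Winograd algorithm, which is assumed to be the variant behind Winograd3x3F32: a 2x2 output tile produced from a 3x3 kernel needs a (2 + 3 - 1) x (2 + 3 - 1) = 4x4 transformed tile, i.e. 16 point-wise products, each batched over all tiles and channels as one GEMM. The arithmetic as a compile-time check:

    // Tile arithmetic for F(m x m, r x r) Winograd; m = 2, r = 3 is an assumption
    // about the variant implemented by Winograd3x3F32.
    constexpr int m         = 2;         // output tile size
    constexpr int r         = 3;         // kernel size
    constexpr int a         = m + r - 1; // transformed tile side = 4
    constexpr int num_gemms = a * a;     // one GEMM per transformed tile position
    static_assert(num_gemms == 16, "matches the 16 GEMMs scheduled in run()");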
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
new file mode 100644
index 0000000..4540aea
--- /dev/null
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OffsetLifetimeManager.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/OffsetMemoryPool.h"
+#include "support/ToolchainSupport.h"
+
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <numeric>
+#include <vector>
+
+using namespace arm_compute;
+
+OffsetLifetimeManager::OffsetLifetimeManager()
+    : _blob(0)
+{
+}
+
+std::unique_ptr<IMemoryPool> OffsetLifetimeManager::create_pool(IAllocator *allocator)
+{
+    ARM_COMPUTE_ERROR_ON(allocator == nullptr);
+    return support::cpp14::make_unique<OffsetMemoryPool>(allocator, _blob);
+}
+
+MappingType OffsetLifetimeManager::mapping_type() const
+{
+    return MappingType::OFFSETS;
+}
+
+void OffsetLifetimeManager::update_blobs_and_mappings()
+{
+    ARM_COMPUTE_ERROR_ON(!are_all_finalized());
+    ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
+
+    // Update blob size
+    size_t max_group_size = std::accumulate(std::begin(_active_elements), std::end(_active_elements), static_cast<size_t>(0), [](size_t s, const Element & e)
+    {
+        return s + e.size;
+    });
+    _blob = std::max(_blob, max_group_size);
+
+    // Calculate group mappings
+    auto &group_mappings = _active_group->mappings();
+    size_t offset         = 0;
+    for(auto &e : _active_elements)
+    {
+        group_mappings[e.handle] = offset;
+        offset += e.size;
+        ARM_COMPUTE_ERROR_ON(offset > _blob);
+    }
+}
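
update_blobs_and_mappings() packs the active elements back to back: the blob grows to the largest per-group sum of element sizes seen so far, and each element's handle is mapped to the running offset. With illustrative sizes of 1024 and 512 bytes, the two elements map to offsets 0 and 1024 and the blob must be at least 1536 bytes. A standalone sketch of that packing:

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Standalone sketch of the offset packing above: returns the per-element
    // offsets and the blob size required for one group (sizes are illustrative).
    std::pair<std::vector<std::size_t>, std::size_t> pack_offsets(const std::vector<std::size_t> &sizes)
    {
        std::vector<std::size_t> offsets;
        std::size_t              offset = 0;
        for(std::size_t s : sizes)
        {
            offsets.push_back(offset);
            offset += s;
        }
        return { offsets, offset }; // e.g. {1024, 512} -> offsets {0, 1024}, blob 1536
    }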
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
new file mode 100644
index 0000000..96f54f8
--- /dev/null
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <algorithm>
+
+#include "arm_compute/runtime/OffsetMemoryPool.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, size_t blob_size)
+    : _allocator(allocator), _blob(), _blob_size(blob_size)
+{
+    ARM_COMPUTE_ERROR_ON(!allocator);
+    _blob = _allocator->allocate(_blob_size, 0);
+}
+
+OffsetMemoryPool::~OffsetMemoryPool()
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    _allocator->free(_blob);
+    _blob = nullptr;
+}
+
+void OffsetMemoryPool::acquire(MemoryMappings &handles)
+{
+    ARM_COMPUTE_ERROR_ON(_blob == nullptr);
+
+    // Assign the blob base plus the per-handle offset to each handle
+    for(auto &handle : handles)
+    {
+        ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+        *handle.first = reinterpret_cast<uint8_t *>(_blob) + handle.second;
+    }
+}
+
+void OffsetMemoryPool::release(MemoryMappings &handles)
+{
+    for(auto &handle : handles)
+    {
+        ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
+        *handle.first = nullptr;
+    }
+}
+
+MappingType OffsetMemoryPool::mapping_type() const
+{
+    return MappingType::OFFSETS;
+}
+
+std::unique_ptr<IMemoryPool> OffsetMemoryPool::duplicate()
+{
+    ARM_COMPUTE_ERROR_ON(!_allocator);
+    return support::cpp14::make_unique<OffsetMemoryPool>(_allocator, _blob_size);
+}
\ No newline at end of file
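
The TensorAllocator changes that follow replace the raw _buffer pointer with a Memory object and add import_memory(), which lets a tensor wrap caller-owned storage instead of allocating its own. A hedged usage sketch; wrap_external and external_buffer are illustrative, the buffer is assumed to be at least total_size() bytes and to outlive the tensor, and Memory is assumed constructible from a raw uint8_t* as in the sub-tensor path below:

    #include "arm_compute/runtime/Tensor.h"

    #include <cstdint>

    // Sketch: bind a caller-owned buffer to a tensor via the new import_memory().
    // The allocator does not take ownership of external_buffer here.
    arm_compute::Status wrap_external(uint8_t *external_buffer)
    {
        using namespace arm_compute;
        Tensor tensor{};
        tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        return tensor.allocator()->import_memory(Memory(external_buffer));
    }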
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 272b9f5..a0d41b2 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "support/ToolchainSupport.h"
 
 #include <cstddef>
 
@@ -65,28 +66,23 @@
 } // namespace
 
 TensorAllocator::TensorAllocator(Tensor *owner)
-    : _associated_memory_group(nullptr), _buffer(nullptr), _owner(owner)
+    : _associated_memory_group(nullptr), _memory(), _owner(owner)
 {
 }
 
 TensorAllocator::~TensorAllocator()
 {
-    if((_associated_memory_group == nullptr) && (_buffer != nullptr))
-    {
-        delete[] _buffer;
-        _buffer = nullptr;
-        info().set_is_resizable(true);
-    }
+    info().set_is_resizable(true);
 }
 
 TensorAllocator::TensorAllocator(TensorAllocator &&o) noexcept
     : ITensorAllocator(std::move(o)),
       _associated_memory_group(o._associated_memory_group),
-      _buffer(o._buffer),
+      _memory(std::move(o._memory)),
       _owner(o._owner)
 {
     o._associated_memory_group = nullptr;
-    o._buffer                  = nullptr;
+    o._memory                  = Memory();
     o._owner                   = nullptr;
 }
 
@@ -97,8 +93,8 @@
         _associated_memory_group   = o._associated_memory_group;
         o._associated_memory_group = nullptr;
 
-        _buffer   = o._buffer;
-        o._buffer = nullptr;
+        _memory   = std::move(o._memory);
+        o._memory = Memory();
 
         _owner   = o._owner;
         o._owner = nullptr;
@@ -118,7 +114,7 @@
     ARM_COMPUTE_UNUSED(validate_subtensor_shape);
 
     // Copy pointer to buffer
-    _buffer = allocator._buffer;
+    _memory = Memory(allocator._memory.buffer());
 
     // Init tensor info with new dimensions
     size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
@@ -130,44 +126,53 @@
 
 uint8_t *TensorAllocator::data() const
 {
-    return _buffer;
+    return _memory.buffer();
 }
 
 void TensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
     if(_associated_memory_group == nullptr)
     {
-        _buffer = new uint8_t[info().total_size()]();
+        _memory = Memory(std::shared_ptr<uint8_t>(new uint8_t[info().total_size()](), [](uint8_t *ptr)
+        {
+            delete[] ptr;
+        }));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_buffer), info().total_size());
+        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.handle()), info().total_size());
     }
     info().set_is_resizable(false);
 }
 
 void TensorAllocator::free()
 {
-    if((_associated_memory_group == nullptr) && (_buffer != nullptr))
-    {
-        delete[] _buffer;
-        _buffer = nullptr;
-        info().set_is_resizable(true);
-    }
+    _memory = Memory();
+    info().set_is_resizable(true);
+}
+
+arm_compute::Status TensorAllocator::import_memory(Memory memory)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(memory.buffer() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
+    _memory = memory;
+    info().set_is_resizable(false);
+
+    return Status{};
 }
 
 void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
 {
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_buffer != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.buffer() != nullptr);
     _associated_memory_group = associated_memory_group;
 }
 
 uint8_t *TensorAllocator::lock()
 {
-    return _buffer;
+    return _memory.buffer();
 }
 
 void TensorAllocator::unlock()