arm_compute v18.11
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 3c45ab3..70235a2 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -53,7 +53,10 @@
     // the kernel to write back output values.
     // As the relation between input and output is transposed window.y() is
     // used for x anchor and window.x() for y anchor.
-    anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+    if(_info->dimension(0) > 1)
+    {
+        anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
+    }
     anchor.set(1, std::max<int>(window.x().start() * _scale_y, anchor[0] + border_size.left) + _y);
 
     // End of the valid region is equal to the start of the last write of the
@@ -66,8 +69,11 @@
     // a size of the region.
     // As the relation between input and output is transposed window.y() is
     // used for x shape and window.x() for y shape.
-    shape.set(0, std::min<int>((old_anchor[1] + old_shape[1]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
-    shape.set(1, std::min<int>((old_anchor[0] + old_shape[0]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+    if(_info->dimension(0) > 1)
+    {
+        shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+    }
+    shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
 
     // For higher dimensions use the intersection of the window size and the
     // valid region of the input
@@ -192,9 +198,9 @@
     ARM_COMPUTE_ERROR_ON(window.x().step() == 0);
 
     const int min_x = window.y().start() * _scale_x + _x;
-    const int max_x = window.y().end() * _scale_x + _x;
+    const int max_x = (window.y().end() - window.y().step()) * _scale_x + _x + _width;
     const int min_y = window.x().start() * _scale_y + _y;
-    const int max_y = window.x().end() * _scale_y + _y;
+    const int max_y = (window.x().end() - window.x().step()) * _scale_y + _y + _height;
 
     const TensorShape &shape = _info->tensor_shape();
 
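To make the access-window arithmetic above concrete: because input and output are transposed, window.y() drives the x anchor/extent and window.x() drives the y anchor/extent, and the end of each valid region is now derived from the start of the last write (end minus one step) plus the access width/height. A minimal standalone C++ trace of the corrected max_x/max_y computation, using made-up window and access values, is:

    #include <cstdio>

    int main()
    {
        // Hypothetical window/access values chosen only to trace the arithmetic.
        const int win_x_start = 0, win_x_end = 16, win_x_step = 4;
        const int win_y_start = 0, win_y_end = 8,  win_y_step = 2;
        const int scale_x = 1, scale_y = 1, x_off = 0, y_off = 0, width = 4, height = 2;

        // Transposed mapping: window.y() -> x range, window.x() -> y range.
        const int min_x = win_y_start * scale_x + x_off;                       // 0
        const int max_x = (win_y_end - win_y_step) * scale_x + x_off + width;  // 6 + 4 = 10
        const int min_y = win_x_start * scale_y + y_off;                       // 0
        const int max_y = (win_x_end - win_x_step) * scale_y + y_off + height; // 12 + 2 = 14
        std::printf("x:[%d,%d) y:[%d,%d)\n", min_x, max_x, min_y, max_y);
        return 0;
    }
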
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 9703b0f..0947d58 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -64,6 +64,36 @@
     }
 }
 
+std::string get_cl_select_type_from_data_type(const DataType &dt)
+{
+    switch(dt)
+    {
+        case DataType::U8:
+            return "uchar";
+        case DataType::S8:
+            return "char";
+        case DataType::QASYMM8:
+            return "uchar";
+        case DataType::U16:
+            return "ushort";
+        case DataType::F16:
+        case DataType::S16:
+            return "short";
+        case DataType::U32:
+            return "uint";
+        case DataType::F32:
+        case DataType::S32:
+            return "int";
+        case DataType::U64:
+            return "ulong";
+        case DataType::S64:
+            return "long";
+        default:
+            ARM_COMPUTE_ERROR("Unsupported input data type.");
+            return "";
+    }
+}
+
 std::string get_data_size_from_data_type(const DataType &dt)
 {
     switch(dt)
@@ -114,7 +144,12 @@
 
 bool dot8_supported(const cl::Device &device)
 {
-    return device_supports_extension(device, "cl_arm_integer_dot_product_int8");
+    std::string     device_name = device.getInfo<CL_DEVICE_NAME>();
+    const GPUTarget gpu_target  = get_target_from_name(device_name);
+
+    // SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
+    std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+    return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
 }
 
 bool dot8_acc_supported(const cl::Device &device)
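
The new get_cl_select_type_from_data_type() helper returns the integer type that OpenCL's select() expects as its mask for a given element type (e.g. "short" for F16, "int" for F32); kernels such as activation_layer.cl receive it through a -DSELECT_DATA_TYPE define. A hedged sketch of how the two helpers could be combined into kernel build options (the option names and the exact configure() call site may differ per kernel):

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/core/Types.h"
    #include <set>
    #include <string>

    // Sketch only: assembles the defines activation_layer.cl expects for F16 with 16-wide vectors.
    std::set<std::string> activation_build_opts()
    {
        using namespace arm_compute;
        std::set<std::string> opts;
        opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(DataType::F16));               // "half"
        opts.emplace("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(DataType::F16)); // "short"
        opts.emplace("-DVEC_SIZE=16");
        return opts;
    }
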
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 3c92257..ff4803e 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -152,19 +152,26 @@
     { "arithmetic_add_quantized", "arithmetic_op_quantized.cl" },
     { "arithmetic_add", "arithmetic_op.cl" },
     { "arithmetic_sub", "arithmetic_op.cl" },
+    { "arithmetic_sub_quantized", "arithmetic_op_quantized.cl" },
     { "arithmetic_div", "arithmetic_op.cl" },
+    { "batch_to_space_nchw", "batch_to_space.cl" },
+    { "batch_to_space_static_nchw", "batch_to_space.cl" },
+    { "batch_to_space_nhwc", "batch_to_space.cl" },
+    { "batch_to_space_static_nhwc", "batch_to_space.cl" },
     { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
     { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
     { "bitwise_or", "bitwise_op.cl" },
     { "bitwise_and", "bitwise_op.cl" },
     { "bitwise_xor", "bitwise_op.cl" },
     { "bitwise_not", "bitwise_op.cl" },
+    { "bounding_box_transform", "bounding_box_transform.cl" },
     { "channel_combine_NV", "channel_combine.cl" },
     { "channel_combine_RGB888", "channel_combine.cl" },
     { "channel_combine_RGBA8888", "channel_combine.cl" },
     { "channel_combine_UYVY422", "channel_combine.cl" },
     { "channel_combine_YUYV422", "channel_combine.cl" },
     { "channel_shuffle_nchw", "channel_shuffle.cl" },
+    { "channel_shuffle_nhwc", "channel_shuffle.cl" },
     { "channel_extract_NV12", "channel_extract.cl" },
     { "channel_extract_NV21", "channel_extract.cl" },
     { "channel_extract_RGB888", "channel_extract.cl" },
@@ -175,6 +182,8 @@
     { "combine_gradients_L2", "canny.cl" },
     { "concatenate_depth", "concatenate.cl" },
     { "concatenate_width", "concatenate.cl" },
+    { "concatenate_width_x2", "concatenate.cl" },
+    { "concatenate_width_x4", "concatenate.cl" },
     { "convolution_rectangle", "convolution_rectangle.cl" },
     { "col2im", "col2im.cl" },
     { "convert_depth_down", "depth_convert.cl" },
@@ -191,6 +200,7 @@
     { "convolution_separable1x9_static", "convolution9x9.cl" },
     { "convolution_separable9x1_static", "convolution9x9.cl" },
     { "copy_tensor", "copy_tensor.cl" },
+    { "copy_pad_tensor", "copy_tensor.cl" },
     { "copy_plane", "channel_extract.cl" },
     { "copy_planes_3p", "channel_combine.cl" },
     { "copy_to_keypoint", "fast_corners.cl" },
@@ -230,6 +240,7 @@
     { "fill_image_borders_constant", "fill_border.cl" },
     { "fill_image_borders_replicate", "fill_border.cl" },
     { "finalize", "optical_flow_pyramid_lk.cl" },
+    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "floor_layer", "floor.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
@@ -240,16 +251,19 @@
     { "gemm_mv", "gemv.cl" },
     { "gemm_mv_quantized", "gemv.cl" },
     { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+    { "gemm_mm_interleaved_transposed_f16_acc32", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point", "gemm.cl" },
     { "gemm_mm_floating_point_f16_bifrost", "gemm.cl" },
+    { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost", "gemm.cl" },
     { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_transpose1xW", "gemm.cl" },
     { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
+    { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
     { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
     { "gemmlowp_mm_bifrost", "gemmlowp.cl" },
     { "gemmlowp_mm_bifrost_dot8", "gemmlowp.cl" },
@@ -258,8 +272,12 @@
     { "gemmlowp_mm_interleaved_transposed_bifrost_dot8", "gemmlowp.cl" },
     { "gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl" },
     { "gemmlowp_offset_contribution", "gemmlowp.cl" },
+    { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
+    { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
     { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
     { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
+    { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
     { "harris_score_3x3", "harris_corners.cl" },
     { "harris_score_5x5", "harris_corners.cl" },
     { "harris_score_7x7", "harris_corners.cl" },
@@ -288,11 +306,14 @@
     { "IYUV_to_RGB888_bt709", "color_convert.cl" },
     { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
     { "IYUV_to_YUV444_bt709", "color_convert.cl" },
-    { "l2_normalize", "l2_normalize.cl" },
+    { "l2_normalize_x", "l2_normalize.cl" },
+    { "l2_normalize_y", "l2_normalize.cl" },
+    { "l2_normalize_z", "l2_normalize.cl" },
     { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
     { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
     { "magnitude_phase", "magnitude_phase.cl" },
     { "mean_stddev_accumulate", "mean_stddev.cl" },
+    { "memset", "memset.cl" },
     { "minmax", "minmaxloc.cl" },
     { "minmax_border", "minmaxloc.cl" },
     { "minmax_layer", "minmax_layer.cl" },
@@ -306,6 +327,10 @@
     { "non_max_suppression", "nonmax.cl" },
     { "normalization_layer_cross_map", "normalization_layer.cl" },
     { "normalization_layer_in_map", "normalization_layer.cl" },
+    { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
+    { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
+    { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
+    { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
     { "NV12_to_IYUV_bt709", "color_convert.cl" },
     { "NV12_to_RGB888_bt709", "color_convert.cl" },
     { "NV12_to_RGBA8888_bt709", "color_convert.cl" },
@@ -320,6 +345,7 @@
     { "permute_3201", "permute.cl" },
     { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
+    { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
     { "pooling_layer_2", "pooling_layer.cl" },
     { "pooling_layer_3", "pooling_layer.cl" },
     { "pooling_layer_optimized_3", "pooling_layer.cl" },
@@ -328,25 +354,37 @@
     { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
+    { "prior_box_layer_nchw", "prior_box_layer.cl" },
+    { "prior_box_layer_nhwc", "prior_box_layer.cl" },
     { "quantization_layer", "quantization_layer.cl" },
-    { "reduction_operation", "reduction_operation.cl" },
+    { "reduction_operation_x", "reduction_operation.cl" },
+    { "reduction_operation_quantized_x", "reduction_operation.cl" },
+    { "reduction_operation_y", "reduction_operation.cl" },
+    { "reduction_operation_z", "reduction_operation.cl" },
+    { "reduction_operation_w", "reduction_operation.cl" },
     { "remap_nearest_neighbour", "remap.cl" },
     { "remap_bilinear", "remap.cl" },
+    { "reorg_layer_nchw", "reorg_layer.cl" },
+    { "reorg_layer_nhwc", "reorg_layer.cl" },
     { "reshape_layer", "reshape_layer.cl" },
     { "reshape_to_columns", "convolution_layer.cl" },
     { "RGB888_to_IYUV_bt709", "color_convert.cl" },
     { "RGB888_to_NV12_bt709", "color_convert.cl" },
     { "RGB888_to_RGBA8888_bt709", "color_convert.cl" },
+    { "RGB888_to_U8_bt709", "color_convert.cl" },
     { "RGB888_to_YUV444_bt709", "color_convert.cl" },
     { "RGBA8888_to_IYUV_bt709", "color_convert.cl" },
     { "RGBA8888_to_NV12_bt709", "color_convert.cl" },
     { "RGBA8888_to_RGB888_bt709", "color_convert.cl" },
     { "RGBA8888_to_YUV444_bt709", "color_convert.cl" },
+    { "roi_align_layer", "roi_align_layer.cl" },
     { "roi_pooling_layer", "roi_pooling_layer.cl" },
     { "scale_nearest_neighbour_nchw", "scale.cl" },
     { "scale_nearest_neighbour_nhwc", "scale.cl" },
     { "scale_bilinear_nchw", "scale.cl" },
     { "scale_bilinear_nhwc", "scale.cl" },
+    { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
+    { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
     { "scharr3x3", "scharr_filter.cl" },
     { "sobel3x3", "sobel_filter.cl" },
     { "sobel_separable5x1", "sobel_filter.cl" },
@@ -358,7 +396,12 @@
     { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
     { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
+    { "space_to_batch_nchw", "space_to_batch.cl" },
+    { "space_to_batch_static_nchw", "space_to_batch.cl" },
+    { "space_to_batch_nhwc", "space_to_batch.cl" },
+    { "space_to_batch_static_nhwc", "space_to_batch.cl" },
     { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
+    { "strided_slice", "slice_ops.cl" },
     { "suppress_non_maximum", "canny.cl" },
     { "tablelookup_U8", "tablelookup.cl" },
     { "tablelookup_S16", "tablelookup.cl" },
@@ -369,6 +412,8 @@
     { "UYVY422_to_NV12_bt709", "color_convert.cl" },
     { "UYVY422_to_RGB888_bt709", "color_convert.cl" },
     { "UYVY422_to_RGBA8888_bt709", "color_convert.cl" },
+    { "upsample_layer_nchw", "upsample_layer.cl" },
+    { "upsample_layer_nhwc", "upsample_layer.cl" },
     { "warp_affine_nearest_neighbour", "warp_affine.cl" },
     { "warp_affine_bilinear", "warp_affine.cl" },
     { "warp_perspective_nearest_neighbour", "warp_perspective.cl" },
@@ -421,6 +466,8 @@
     { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
     { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
+    { "yolo_layer_nchw", "yolo_layer.cl" },
+    { "yolo_layer_nhwc", "yolo_layer.cl" },
     { "YUYV422_to_IYUV_bt709", "color_convert.cl" },
     { "YUYV422_to_NV12_bt709", "color_convert.cl" },
     { "YUYV422_to_RGB888_bt709", "color_convert.cl" },
@@ -455,10 +502,18 @@
 #include "./cl_kernels/arithmetic_op_quantized.clembed"
     },
     {
+        "batch_to_space.cl",
+#include "./cl_kernels/batch_to_space.clembed"
+    },
+    {
         "bitwise_op.cl",
 #include "./cl_kernels/bitwise_op.clembed"
     },
     {
+        "bounding_box_transform.cl",
+#include "./cl_kernels/bounding_box_transform.clembed"
+    },
+    {
         "canny.cl",
 #include "./cl_kernels/canny.clembed"
     },
@@ -519,6 +574,10 @@
 #include "./cl_kernels/copy_tensor.clembed"
     },
     {
+        "upsample_layer.cl",
+#include "./cl_kernels/upsample_layer.clembed"
+    },
+    {
         "deconvolution_layer.cl",
 #include "./cl_kernels/deconvolution_layer.clembed"
     },
@@ -599,6 +658,10 @@
 #include "./cl_kernels/gemv.clembed"
     },
     {
+        "generate_proposals.cl",
+#include "./cl_kernels/generate_proposals.clembed"
+    },
+    {
         "harris_corners.cl",
 #include "./cl_kernels/harris_corners.clembed"
     },
@@ -639,6 +702,10 @@
 #include "./cl_kernels/mean_stddev.clembed"
     },
     {
+        "memset.cl",
+#include "./cl_kernels/memset.clembed"
+    },
+    {
         "minmaxloc.cl",
 #include "./cl_kernels/minmaxloc.clembed"
     },
@@ -667,6 +734,14 @@
 #include "./cl_kernels/normalization_layer.clembed"
     },
     {
+        "normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/normalize_planar_yuv_layer.clembed"
+    },
+    {
+        "normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed"
+    },
+    {
         "batchnormalization_layer.cl",
 #include "./cl_kernels/batchnormalization_layer.clembed"
     },
@@ -695,6 +770,10 @@
 #include "./cl_kernels/pooling_layer_quantized.clembed"
     },
     {
+        "prior_box_layer.cl",
+#include "./cl_kernels/prior_box_layer.clembed"
+    },
+    {
         "quantization_layer.cl",
 #include "./cl_kernels/quantization_layer.clembed"
     },
@@ -707,10 +786,18 @@
 #include "./cl_kernels/remap.clembed"
     },
     {
+        "reorg_layer.cl",
+#include "./cl_kernels/reorg_layer.clembed"
+    },
+    {
         "reshape_layer.cl",
 #include "./cl_kernels/reshape_layer.clembed"
     },
     {
+        "roi_align_layer.cl",
+#include "./cl_kernels/roi_align_layer.clembed"
+    },
+    {
         "roi_pooling_layer.cl",
 #include "./cl_kernels/roi_pooling_layer.clembed"
     },
@@ -719,6 +806,10 @@
 #include "./cl_kernels/scale.clembed"
     },
     {
+        "scale_quantized.cl",
+#include "./cl_kernels/scale_quantized.clembed"
+    },
+    {
         "scharr_filter.cl",
 #include "./cl_kernels/scharr_filter.clembed"
     },
@@ -735,6 +826,14 @@
 #include "./cl_kernels/softmax_layer_quantized.clembed"
     },
     {
+        "slice_ops.cl",
+#include "./cl_kernels/slice_ops.clembed"
+    },
+    {
+        "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
+    },
+    {
         "tablelookup.cl",
 #include "./cl_kernels/tablelookup.clembed"
     },
@@ -774,6 +873,10 @@
         "winograd_output_transform.cl",
 #include "./cl_kernels/winograd_output_transform.clembed"
     },
+    {
+        "yolo_layer.cl",
+#include "./cl_kernels/yolo_layer.clembed"
+    },
 #endif /* EMBEDDED_KERNELS */
 };
 
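For context on the many new map entries above: CLKernelLibrary resolves a kernel name to its .cl program (embedded as a .clembed blob when EMBEDDED_KERNELS is defined) and builds it with the caller's options. A hedged usage sketch, assuming the create_kernel(name, build-option set) entry point of the v18.11 headers and that the CL context has already been initialised (e.g. through CLScheduler::get().default_init()):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include <set>
    #include <string>

    // Sketch only: the build options are illustrative, not necessarily what the library's
    // memset kernel passes in practice.
    void build_memset_kernel_example()
    {
        std::set<std::string> opts = { "-DDATA_TYPE=float", "-DVEC_SIZE=16" };
        // "memset" resolves to memset.cl via the kernel/program map extended in this patch.
        auto kernel = arm_compute::CLKernelLibrary::get().create_kernel("memset", opts);
        (void)kernel;
    }
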
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 491e0c4..995fcb4 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -110,10 +110,12 @@
     ARM_COMPUTE_UNUSED(idx_start);
 }
 
+#ifndef DOXYGEN_SKIP_THIS
 template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window);
 template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window);
+#endif /* DOXYGEN_SKIP_THIS */
 
 void ICLKernel::set_target(cl::Device &device)
 {
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 486bb6a..6725f36 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -106,6 +106,7 @@
     LOAD_FUNCTION_PTR(clReleaseMemObject, handle);
     LOAD_FUNCTION_PTR(clGetDeviceInfo, handle);
     LOAD_FUNCTION_PTR(clGetDeviceIDs, handle);
+    LOAD_FUNCTION_PTR(clGetMemObjectInfo, handle);
     LOAD_FUNCTION_PTR(clRetainEvent, handle);
     LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
     LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
@@ -796,6 +797,24 @@
     }
 }
 
+cl_int clGetMemObjectInfo(cl_mem      memobj,
+                          cl_mem_info param_name,
+                          size_t      param_value_size,
+                          void       *param_value,
+                          size_t     *param_value_size_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
+    if(func != nullptr)
+    {
+        return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
 cl_int clRetainEvent(cl_event event)
 {
     arm_compute::CLSymbols::get().load_default();
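
The newly loaded clGetMemObjectInfo symbol goes through the same lazy-loading shim as the other CL entry points; if the symbol cannot be resolved, the wrapper above reports CL_OUT_OF_RESOURCES. A small standard-OpenCL usage sketch (the helper name is illustrative):

    #include "arm_compute/core/CL/OpenCL.h"

    // Returns the allocation size of an existing cl_mem, or 0 if the query fails
    // (including the CL_OUT_OF_RESOURCES fallback when the symbol is unavailable).
    size_t cl_buffer_size(cl_mem buf)
    {
        size_t size = 0;
        if(clGetMemObjectInfo(buf, CL_MEM_SIZE, sizeof(size), &size, nullptr) != CL_SUCCESS)
        {
            return 0;
        }
        return size;
    }
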
diff --git a/src/core/CL/cl_kernels/activation_helpers.h b/src/core/CL/cl_kernels/activation_helpers.h
new file mode 100644
index 0000000..dfab082
--- /dev/null
+++ b/src/core/CL/cl_kernels/activation_helpers.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(TYPE) && defined(SELECT_TYPE)
+
+#define CONST_ONE 1.f
+#define ABS_OP(a) fabs((a))
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define MLA_OP(a, b, c) ((b) * (c) + (a))
+#define DIV_OP(a, b) ((a) / (b))
+#define EXP_OP(a) exp((a))
+#define LOG_OP(a) log((a))
+#define SQRT_OP(a) sqrt((a))
+#define TANH_OP(a) tanh((a))
+
+// Logistic Activation
+inline TYPE logistic_op(TYPE x)
+{
+    return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
+}
+// Hyperbolic Tangent Activation
+inline TYPE tanh_op(TYPE x)
+{
+    return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
+}
+// RELU Activation
+inline TYPE relu_op(TYPE x)
+{
+    return max((TYPE)0, x);
+}
+// Bounded RELU Activation
+inline TYPE brelu_op(TYPE x)
+{
+    return min((TYPE)A_VAL, max((TYPE)0, x));
+}
+// Lower Upper Bounded RELU Activation
+inline TYPE lu_brelu_op(TYPE x)
+{
+    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
+}
+// Leaky RELU Activation
+inline TYPE lrelu_op(TYPE x)
+{
+    return select(MUL_OP((TYPE)A_VAL, x), x, CONVERT(x > (TYPE)0, SELECT_TYPE));
+}
+// Soft RELU Activation
+inline TYPE srelu_op(TYPE x)
+{
+    return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
+}
+// Absolute Activation
+inline TYPE abs_op(TYPE x)
+{
+    return ABS_OP(x);
+}
+// Square Activation
+inline TYPE square_op(TYPE x)
+{
+    return MUL_OP(x, x);
+}
+// Square-root Activation
+inline TYPE sqrt_op(TYPE x)
+{
+    return SQRT_OP(x);
+}
+// Linear Activation
+inline TYPE linear_op(TYPE x)
+{
+    return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
+}
+
+#define ACTIVATION_OP2(op, x) op##_op(x)
+#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+
+#endif // defined(TYPE) && defined(SELECT_TYPE)
\ No newline at end of file
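
The ACTIVATION_OP macro at the end of the new header is a token-pasting dispatcher: the activation name supplied at compile time (e.g. -DACT=lrelu on the kernel build line) expands to the matching *_op() function. A scalar C++ analogue of that dispatch (the real header operates on OpenCL vector types selected through -DTYPE/-DSELECT_TYPE):

    #include <cstdio>

    #define A_VAL 0.1f
    #define ACT lrelu // would arrive as -DACT=lrelu on the OpenCL build line

    static inline float lrelu_op(float x) { return x > 0.f ? x : A_VAL * x; }

    #define ACTIVATION_OP2(op, x) op##_op(x)
    #define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)

    int main()
    {
        std::printf("%f\n", ACTIVATION_OP(ACT, -2.0f)); // prints -0.200000
        return 0;
    }
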
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index 373406a..cf1f434 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -21,80 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "helpers.h"
-
 #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
 
-#define CONST_ONE 1.f
-#define ABS_OP(a) fabs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define TANH_OP(a) tanh((a))
-
-// Logistic Activation
-inline TYPE logistic_op(TYPE x)
-{
-    return DIV_OP((TYPE)CONST_ONE, ADD_OP((TYPE)CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-inline TYPE tanh_op(TYPE x)
-{
-    return MUL_OP((TYPE)A_VAL, TANH_OP(MUL_OP((TYPE)B_VAL, x)));
-}
-// RELU Tangent Activation
-inline TYPE relu_op(TYPE x)
-{
-    return max(0, x);
-}
-// Bounded RELU Activation
-inline TYPE brelu_op(TYPE x)
-{
-    return min((TYPE)A_VAL, max(0, x));
-}
-// Lower Upper Bounded RELU Activation
-inline TYPE lu_brelu_op(TYPE x)
-{
-    return min(max(x, (TYPE)B_VAL), (TYPE)A_VAL);
-}
-// Leaky RELU Activation
-inline TYPE lrelu_op(TYPE x)
-{
-    return select(MUL_OP((TYPE)A_VAL, x), x, x > (TYPE)0);
-}
-// Soft RELU Activation
-inline TYPE srelu_op(TYPE x)
-{
-    return LOG_OP(ADD_OP((TYPE)CONST_ONE, EXP_OP(x)));
-}
-// Absolute Activation
-inline TYPE abs_op(TYPE x)
-{
-    return ABS_OP(x);
-}
-// Square Activation
-inline TYPE square_op(TYPE x)
-{
-    return MUL_OP(x, x);
-}
-// Square-root Activation
-inline TYPE sqrt_op(TYPE x)
-{
-    return SQRT_OP(x);
-}
-// Linear Activation
-inline TYPE linear_op(TYPE x)
-{
-    return MLA_OP((TYPE)B_VAL, (TYPE)A_VAL, x);
-}
-
-#define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#include "activation_helpers.h"
 
 #if defined(ACT)
 
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 9efb71b..557615e 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -33,11 +33,13 @@
 
 #define DIV(x, y) (x) / (y)
 
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE)
 /** This function adds two tensors.
  *
  * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
  * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
  * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
  * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -75,14 +77,16 @@
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
     // Load values
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_a = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
-    VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
-    in_b = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, 16));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
 
     // Calculate and store result
-    vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
+    VSTORE(VEC_SIZE)
+    (ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE) */
 
 /** This function subtracts one tensor from another.
  *
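
The addition kernel above is now generic over the vector width instead of hard-coding vload16/vstore16, so the host side also has to supply -DVEC_SIZE. A hedged sketch of the corresponding build options (the real CLArithmeticAdditionKernel::configure() may choose the width differently):

    #include "arm_compute/core/CL/CLHelpers.h"
    #include "arm_compute/core/Types.h"
    #include <set>
    #include <string>

    // Sketch only: builds the defines arithmetic_add now requires, keeping the historical 16-wide vectors.
    std::set<std::string> arithmetic_add_build_opts(arm_compute::DataType in1, arm_compute::DataType in2, arm_compute::DataType out)
    {
        using arm_compute::get_cl_type_from_data_type;
        std::set<std::string> opts;
        opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(in1));
        opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(in2));
        opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out));
        opts.emplace("-DVEC_SIZE=16");
        return opts;
    }
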
diff --git a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
index 082317b..fc7fa77 100644
--- a/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
@@ -31,12 +31,27 @@
 #define SUB(x, y) (x) - (y)
 #endif /* SATURATE */
 
-#if defined(OFFSET_IN1)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+
+#if defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
 
 /** This function adds two tensors.
  *
- * @attention The quantization offset must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  *
  * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
  * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -73,6 +88,69 @@
     Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+    const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+    const VEC_FLOAT qresf32 = (in1f32 + in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+    const VEC_UCHAR res     = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) */
+
+/** This function subtracts two tensors.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void arithmetic_sub_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
     int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
     int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
 
@@ -81,10 +159,10 @@
 
     const float16 in1f32  = convert_float16(in_a) * (float16)((float)SCALE_IN1);
     const float16 in2f32  = convert_float16(in_b) * (float16)((float)SCALE_IN2);
-    const float16 qresf32 = (in1f32 + in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
+    const float16 qresf32 = (in1f32 - in2f32) / ((float16)(float)SCALE_OUT) + ((float16)((float16)OFFSET_OUT));
     const uchar16 res     = convert_uchar16_sat(convert_int16_rte(qresf32));
 
     // Store result
     vstore16(res, 0, (__global uchar *)out.ptr);
 }
-#endif /* defined(OFFSET) */
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
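
The quantized add and the new arithmetic_sub_quantized kernel both follow the same dequantise / operate / requantise pattern spelled out by the defines documented above. A scalar C++ model of one output element, with illustrative quantisation parameters:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the per-lane math of the kernels: x_f = (q - OFFSET_INx) * SCALE_INx,
    // result = (a_f +/- b_f) / SCALE_OUT + OFFSET_OUT, rounded to nearest even and saturated.
    uint8_t qarith(uint8_t a, uint8_t b, bool subtract)
    {
        const float scale_in1 = 0.5f, scale_in2 = 0.5f, scale_out = 1.0f; // illustrative values
        const int   off_in1 = 128, off_in2 = 128, off_out = 128;
        const float af  = (int(a) - off_in1) * scale_in1;
        const float bf  = (int(b) - off_in2) * scale_in2;
        const float res = (subtract ? af - bf : af + bf) / scale_out + off_out;
        return uint8_t(std::min(255.f, std::max(0.f, std::nearbyintf(res))));
    }

    int main()
    {
        std::printf("add: %u  sub: %u\n", qarith(130, 140, false), qarith(130, 140, true)); // 135 and 123
        return 0;
    }
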
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
new file mode 100644
index 0000000..8506fc3
--- /dev/null
+++ b/src/core/CL/cl_kernels/batch_to_space.cl
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+/** Batch to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file
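
The index arithmetic shared by all four batch_to_space kernels maps element (x, y) of input batch batch_id onto output batch w = batch_id % r, where r = BATCH_SIZE / (block_x * block_y) is the number of output batches. A scalar C++ trace with illustrative sizes:

    #include <cstdio>

    int main()
    {
        // Illustrative shapes: 8 input batches, 2x2 block -> 2 output batches.
        const int BATCH_SIZE = 8, block_x = 2, block_y = 2;
        const int batch_id = 5, x = 3, y = 1; // one input element

        const int r     = BATCH_SIZE / (block_x * block_y);      // 2
        const int w     = batch_id % r;                           // output batch 1
        const int out_x = x * block_x + (batch_id / r) % block_x; // 6
        const int out_y = y * block_y + (batch_id / r) / block_x; // 3
        std::printf("(%d,%d) of batch %d -> (%d,%d) of batch %d\n", x, y, batch_id, out_x, out_y, w);
        return 0;
    }
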
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 5352af3..dfd16e0 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -23,14 +23,14 @@
  */
 #include "helpers.h"
 
-#if defined(VEC_SIZE) && defined(DATA_TYPE)
-
 #define ADD_OP(a, b) ((a) + (b))
 #define SUB_OP(a, b) ((a) - (b))
 #define MUL_OP(a, b) ((a) * (b))
 #define INVSQRT_OP(a) rsqrt((a))
 #define SQCVT_SAT(a) (a)
 
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+
 #if defined(FUSED_ACTIVATION)
 #include "activation_layer.cl"
 #define ACTIVATION_FUNC(x) ACTIVATION_OP(FUSED_ACTIVATION, x)
@@ -258,3 +258,161 @@
     (res, 0, (__global DATA_TYPE *)out.ptr);
 }
 #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) */
+
+#if defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON)
+/** Fuse batchnorm parameters to convolution layer parameters
+ *
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DNUM_CHANNELS=size. e.g. -DNUM_CHANNELS=16
+ * @attention Batch normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  conv_w_ptr                             Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  conv_w_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  conv_w_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  conv_w_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  conv_w_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  conv_w_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  conv_w_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  conv_w_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  bn_mean_ptr                            Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_mean_stride_x                       Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  bn_mean_step_x                         bn_mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_mean_offset_first_element_in_bytes  The offset of the first element in the mean source tensor
+ * @param[in]  bn_var_ptr                             Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_var_stride_x                        Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  bn_var_step_x                          bn_var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_var_offset_first_element_in_bytes   The offset of the first element in the var source tensor
+ * @param[out] fused_w_ptr                            Pointer to the destination weights tensors. Supported data types: same as @p input_ptr
+ * @param[in]  fused_w_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  fused_w_step_x                         fused_w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  fused_w_step_y                         fused_w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  fused_w_step_z                         fused_w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  fused_w_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  fused_w_step_w                         fused_w_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  fused_w_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  fused_b_ptr                            Pointer to the destination bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  fused_b_stride_x                       Stride of the bias source tensor in X dimension (in bytes)
+ * @param[in]  fused_b_step_x                         fused_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  fused_b_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  conv_b_ptr                             Pointer to the source bias tensor. Supported data types: same as @p input_ptr
+ * @param[in]  conv_b_stride_x                        Stride of the source bias tensor in X dimension (in bytes)
+ * @param[in]  conv_b_step_x                          conv_b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  conv_b_offset_first_element_in_bytes   The offset of the first element in the source bias tensor
+ * @param[in]  bn_beta_ptr                            Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_beta_stride_x                       Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  bn_beta_step_x                         bn_beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_beta_offset_first_element_in_bytes  The offset of the first element in the beta source tensor
+ * @param[in]  bn_gamma_ptr                           Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  bn_gamma_stride_x                      Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  bn_gamma_step_x                        bn_gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bn_gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                                Epsilon parameter in the batch normalization equation
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR4D_DECLARATION(conv_w),
+                                            VECTOR_DECLARATION(bn_mean),
+                                            VECTOR_DECLARATION(bn_var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR4D_DECLARATION(fused_w)
+#endif /* not IN_PLACE_W */
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(fused_b)
+#endif /* not IN_PLACE_B */
+#ifdef HAS_BIAS
+                                            ,
+                                            VECTOR_DECLARATION(conv_b)
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+                                            ,
+                                            VECTOR_DECLARATION(bn_beta)
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            ,
+                                            VECTOR_DECLARATION(bn_gamma)
+#endif /* USE_DEFAULT_GAMMA */
+                                           )
+{
+    Tensor4D conv_w  = CONVERT_TO_TENSOR4D_STRUCT(conv_w, NUM_CHANNELS);
+    Vector   bn_mean = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_mean);
+    Vector   bn_var  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_var);
+
+    // In-place ops
+#ifdef IN_PLACE_W
+    Tensor4D fused_w = conv_w;
+#else  /* IN_PLACE_W */
+    Tensor4D  fused_w                      = CONVERT_TO_TENSOR4D_STRUCT(fused_w, NUM_CHANNELS);
+#endif /* IN_PLACE_W */
+#ifdef IN_PLACE_B
+    Vector fused_b = conv_b;
+#else  /* IN_PLACE_B */
+    Vector    fused_b                      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(fused_b);
+#endif /* IN_PLACE_B */
+
+    // Conditional ops
+#ifdef HAS_BIAS
+    Vector conv_b = CONVERT_TO_VECTOR_STRUCT_NO_STEP(conv_b);
+#endif /* HAS_BIAS */
+#ifndef USE_DEFAULT_BETA
+    Vector bn_beta = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector bn_gamma = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bn_gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    const int current_slice = get_global_id(2) / NUM_CHANNELS;
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    conv_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * conv_w_stride_x;
+    fused_w.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * fused_w_stride_x;
+
+    // Load W
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    wn = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)conv_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    DATA_TYPE wn                           = *((__global DATA_TYPE *)(conv_w.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // rvar = 1 / sqrt(var + epsilon)
+    const DATA_TYPE var  = *((__global DATA_TYPE *)(bn_var.ptr + current_slice * bn_var.stride_x));
+    const DATA_TYPE rvar = INVSQRT_OP(ADD_OP(var, SQCVT_SAT((float)EPSILON)));
+    wn *= rvar;
+
+    // Load b
+    const DATA_TYPE mean = *((__global DATA_TYPE *)(bn_mean.ptr + current_slice * bn_mean.stride_x));
+    DATA_TYPE bn         = 0;
+#ifdef HAS_BIAS
+    bn = *((__global DATA_TYPE *)(conv_b.ptr + current_slice * conv_b.stride_x));
+#endif /* HAS_BIAS */
+    bn = (bn - mean) * rvar;
+
+#ifndef USE_DEFAULT_GAMMA
+    const DATA_TYPE gamma_scalar = *((__global DATA_TYPE *)(bn_gamma.ptr + current_slice * bn_gamma.stride_x));
+    wn *= gamma_scalar;
+    bn *= gamma_scalar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    const DATA_TYPE beta_scalar = *((__global DATA_TYPE *)(bn_beta.ptr + current_slice * bn_beta.stride_x));
+    bn += beta_scalar;
+#endif /* USE_DEFAULT_BETA */
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Store updated weights
+    VSTORE(VEC_SIZE)
+    (wn, 0, (__global DATA_TYPE *)fused_w.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(fused_w.ptr)) = wn;
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+
+    // Store updated bias
+    *((__global DATA_TYPE *)(fused_b.ptr + current_slice * fused_b.stride_x)) = bn;
+}
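+
+// Illustrative summary of the fusion above (derived from the kernel body, not a separate
+// implementation): per output channel c, with gamma = 1, beta = 0 and conv bias = 0 when the
+// corresponding defaults are used,
+//   w_fused[c] = gamma[c] * w[c] / sqrt(var[c] + EPSILON)
+//   b_fused[c] = gamma[c] * (conv_b[c] - mean[c]) / sqrt(var[c] + EPSILON) + beta[c]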
+#endif /* defined(NUM_CHANNELS) && defined(DATA_TYPE) && defined(EPSILON) */
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl
new file mode 100644
index 0000000..0972355
--- /dev/null
+++ b/src/core/CL/cl_kernels/bounding_box_transform.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants
+
+/** Transform proposal bounding boxes to target bounding boxes by applying the given deltas (bounding box regression). The transform parameters are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: F16/F32
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: same as @p in_ptr
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Get pixels pointer
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Load delta and box values into registers
+    const DATA_TYPE one     = (DATA_TYPE)1.f;
+    const DATA_TYPE halfone = (DATA_TYPE)0.5f;
+
+    const int py = get_global_id(1); // box
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    delta = vload4(0, (__global DATA_TYPE *)deltas.ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before;
+
+    // Calculate width and centers of the old boxes
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    dims = box.s23 - box.s01 + one;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    ctr = box.s01 + halfone * dims;
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP);
+
+    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_ctr = delta.s01 * dims + ctr;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_dims = exp(delta.s23) * dims;
+
+    // Useful vector constant definitions
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    min_values = 0;
+
+    // Calculate the coordinates of the new boxes
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+    pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+    pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+    // Store them into the output
+    vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
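+// Illustrative recap of the transform above for a box (x1, y1, x2, y2) and deltas (dx, dy, dw, dh):
+//   w = x2 - x1 + 1,  ctr_x = x1 + 0.5 * w,  pred_w = exp(min(dw / WEIGHT_W, BBOX_XFORM_CLIP)) * w
+//   pred_ctr_x = (dx / WEIGHT_X) * w + ctr_x,  pred_x1 = pred_ctr_x - 0.5 * pred_w,  pred_x2 = pred_ctr_x + 0.5 * pred_w
+// (and analogously for y/h), with the result clamped to [0, IMG_WIDTH - 1] x [0, IMG_HEIGHT - 1].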
+
+#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 23962e1..3ac67c5 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -23,19 +23,28 @@
 */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K)
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
 
-// Check valid BLOCK_SIZES
-#if BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16
-#error "Only block sizes 4, 8 and 16 are supported"
-#endif /* BLOCK_SIZE != 4 && BLOCK_SIZE != 8 && BLOCK_SIZE != 16 */
+// Check valid VEC_SIZES
+#if VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
 
-#define TYPE VEC_DATA_TYPE(DATA_TYPE, BLOCK_SIZE)
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 
-/** Perfoms channel shuffle see https://arxiv.org/pdf/1707.01083.pdf for details.
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
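+// For illustration: DIV_MOD_UINT computes an unsigned quotient and remainder in one step, using a
+// float reciprocal for the division, e.g. DIV_MOD_UINT(13, 4, d, m) gives d = 3 and m = 1. It is
+// intended for the small index ranges used below (batch/channel ids), where the float
+// approximation of 1/y is exact enough.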
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
  *
- * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group should be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
  * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -45,6 +54,8 @@
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
@@ -53,80 +64,118 @@
  * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void channel_shuffle_nchw(TENSOR3D_DECLARATION(src),
-                                   TENSOR3D_DECLARATION(dst))
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
 {
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+    uint curr_channel = 0; // channel id of input
+    uint batch_id     = 0; // batch id
+    uint group_id     = 0; // group id
+    uint channel_id   = 0; // channel id within the group
 
-    const uint curr_channel = get_global_id(2); // channel id of input
-    const uint group_id     = curr_channel / K; // group id
-    const uint channel_id   = curr_channel % K; // channel id within the group
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
 
-    const uint x = get_global_id(0) * BLOCK_SIZE;
-    const uint y = get_global_id(1) * BLOCK_SIZE;
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+    const uint x = get_global_id(0) * VEC_SIZE;
+    const uint y = get_global_id(1) * 2;
     const uint z = channel_id * NUM_GROUPS + group_id;
 
-    // Load the NxN block
-    TYPE u0 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 0, 0));
-    TYPE u1 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 1, 0));
-    TYPE u2 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 2, 0));
-    TYPE u3 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 3, 0));
-#if BLOCK_SIZE > 4
-    TYPE u4 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 4, 0));
-    TYPE u5 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 5, 0));
-    TYPE u6 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 6, 0));
-    TYPE u7 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 7, 0));
-#if BLOCK_SIZE == 16
-    TYPE u8  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 8, 0));
-    TYPE u9  = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 9, 0));
-    TYPE u10 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 10, 0));
-    TYPE u11 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 11, 0));
-    TYPE u12 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 12, 0));
-    TYPE u13 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 13, 0));
-    TYPE u14 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 14, 0));
-    TYPE u15 = VLOAD(BLOCK_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, 0, 15, 0));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+    TYPE u0                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
 
     // Store blocks
-    VSTORE(BLOCK_SIZE)
-    (u0, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 0, z));
-    VSTORE(BLOCK_SIZE)
-    (u1, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 1, z));
-    VSTORE(BLOCK_SIZE)
-    (u2, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 2, z));
-    VSTORE(BLOCK_SIZE)
-    (u3, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 3, z));
-#if BLOCK_SIZE > 4
-    VSTORE(BLOCK_SIZE)
-    (u4, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 4, z));
-    VSTORE(BLOCK_SIZE)
-    (u5, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 5, z));
-    VSTORE(BLOCK_SIZE)
-    (u6, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 6, z));
-    VSTORE(BLOCK_SIZE)
-    (u7, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 7, z));
-#if BLOCK_SIZE == 16
-    VSTORE(BLOCK_SIZE)
-    (u8, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 8, z));
-    VSTORE(BLOCK_SIZE)
-    (u9, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 9, z));
-    VSTORE(BLOCK_SIZE)
-    (u10, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 10, z));
-    VSTORE(BLOCK_SIZE)
-    (u11, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 11, z));
-    VSTORE(BLOCK_SIZE)
-    (u12, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 12, z));
-    VSTORE(BLOCK_SIZE)
-    (u13, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 13, z));
-    VSTORE(BLOCK_SIZE)
-    (u14, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 14, z));
-    VSTORE(BLOCK_SIZE)
-    (u15, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, x, y + 15, z));
-#endif /* BLOCK_SIZE == 16 */
-#endif /* BLOCK_SIZE > 4 */
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+    VSTORE(VEC_SIZE)
+    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
 }
-#endif /* defined(DATA_TYPE) && defined(BLOCK_SIZE) && defined(NUM_GROUPS) && defined(K) */
+
+#if VEC_SIZE == 4 && defined(LAST_ACCESSED)
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note This implementation is only defined for VEC_SIZE = 4
+ * @note The last element accessed along the first dimension must be given as a preprocessor argument using -DLAST_ACCESSED=num. e.g. -DLAST_ACCESSED=64 in order to prevent out-of-bound writes.
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The height of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    const uint curr_channel = min((uint)(get_global_id(0) * VEC_SIZE), (uint)LAST_ACCESSED); // input feature map
+    uint       channel_id0  = 0;
+    uint       channel_id1  = 0;
+    uint       channel_id2  = 0;
+    uint       channel_id3  = 0;
+    uint       group_id0    = 0;
+    uint       group_id1    = 0;
+    uint       group_id2    = 0;
+    uint       group_id3    = 0;
+    uint       y            = 0;
+    uint       batch_id     = 0;
+
+    // Compute batch_id and y
+    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, y);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel + (uint)0, K, group_id0, channel_id0);
+    DIV_MOD_UINT(curr_channel + (uint)1, K, group_id1, channel_id1);
+    DIV_MOD_UINT(curr_channel + (uint)2, K, group_id2, channel_id2);
+    DIV_MOD_UINT(curr_channel + (uint)3, K, group_id3, channel_id3);
+
+    const uint x  = get_global_id(1) * 2;
+    const uint z0 = channel_id0 * (uint)NUM_GROUPS + group_id0;
+    const uint z1 = channel_id1 * (uint)NUM_GROUPS + group_id1;
+    const uint z2 = channel_id2 * (uint)NUM_GROUPS + group_id2;
+    const uint z3 = channel_id3 * (uint)NUM_GROUPS + group_id3;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + curr_channel * sizeof(DATA_TYPE) + x * src_stride_y + y * src_stride_z + batch_id * src_stride_w;
+    TYPE u0                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    TYPE u1                         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr                                                              = dst_ptr + dst_offset_first_element_in_bytes + x * dst_stride_y + y * dst_stride_z + batch_id * dst_stride_w;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u0.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u0.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u0.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)0 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u0.s3;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z0 * sizeof(DATA_TYPE))) = u1.s0;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z1 * sizeof(DATA_TYPE))) = u1.s1;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z2 * sizeof(DATA_TYPE))) = u1.s2;
+    *((__global DATA_TYPE *)(output_ptr + (uint)1 * dst_stride_y + z3 * sizeof(DATA_TYPE))) = u1.s3;
+}
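+
+// Both variants apply the same permutation: input channel c = group_id * K + channel_id is written
+// to output channel channel_id * NUM_GROUPS + group_id. As an illustrative example, with
+// NUM_GROUPS = 2 and K = 3, input channels (0, 1, 2, 3, 4, 5) map to output channels
+// (0, 2, 4, 1, 3, 5).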
+#endif // VEC_SIZE == 4 && defined(LAST_ACCESSED)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index 5e52127..b02d07b 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
 
 #if ELEMENT_SIZE == 1
 #define COND_DATA_TYPE char
@@ -41,7 +41,7 @@
  * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
  * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
  * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
- * @note In case of grouping the GROUPING flag must be passed at compile time using -DGROUPING
+ * @note The number of groups must be passed at compile time using  -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -58,15 +58,16 @@
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void col2im(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint dst_stride_w)
+    TENSOR4D_DECLARATION(dst))
 {
     Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
 
     const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
     const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
@@ -86,27 +87,25 @@
     // If out-of-bound, overwrite with the first element
     data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
 
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes;
+#if NUM_GROUPS > 1
+    // Compute output offset (batches on 4th dimension)
+    int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
 
-#if defined(GROUPING)
-    // Compute output offset (batches on 4th dimension, no need to compute manually)
-    int idx = yd * dst_stride_y + xd * dst_stride_x;
-
-    const uint group = get_global_id(2); // group ID
+    const uint group = get_global_id(2) % NUM_GROUPS; // group ID
     x_clamped += group * WIDTH_INPUT;
-#else  /* defined(GROUPING) */
+#else  /* NUM_GROUPS > 1 */
     // Compute output offset (batches on 3rd dimension)
-    int idx = yd * dst_stride_y + xd * dst_stride_x + get_global_id(2) * dst_stride_w;
-#endif /* GROUPING */
+    int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
+#endif /* NUM_GROUPS > 1 */
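+    // Note on the grouped case above: get_global_id(2) enumerates (batch, group) pairs, so the
+    // batch index is get_global_id(2) / NUM_GROUPS and the group index get_global_id(2) % NUM_GROUPS;
+    // shifting the column index by group * WIDTH_INPUT makes the scatter along Z land in that
+    // group's range of output channels.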
 
     // Store value
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s0 * dst_stride_z)) = data.s0;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s1 * dst_stride_z)) = data.s1;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s2 * dst_stride_z)) = data.s2;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s3 * dst_stride_z)) = data.s3;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s4 * dst_stride_z)) = data.s4;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s5 * dst_stride_z)) = data.s5;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s6 * dst_stride_z)) = data.s6;
-    *((__global DATA_TYPE *)(output_ptr + idx + x_clamped.s7 * dst_stride_z)) = data.s7;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
 }
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT)
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index 02a0c8e..7a872b4 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -64,6 +64,54 @@
     vstore16(rgba_3, 0, out.ptr + 48);
 }
 
+/** Convert an RGB888 image to U8
+ *
+ * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
+ * No offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported Format: RGB888
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported Format: U8
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void RGB888_to_U8_bt709(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    Image in  = CONVERT_TO_IMAGE_STRUCT(input);
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    // handle 16 pixels every time
+    const uchar16 rgb_0 = vload16(0, in.ptr);
+    const uchar16 rgb_1 = vload16(0, in.ptr + 16);
+    const uchar16 rgb_2 = vload16(0, in.ptr + 32);
+
+    //Resequence values from a sequence of 16 RGB values to sequence of 16 R, 16 G, 16 B values
+    const uchar16 rgb_r = (uchar16)(rgb_0.s0369, rgb_0.scf, rgb_1.s258b, rgb_1.se, rgb_2.s147a, rgb_2.sd);
+    const uchar16 rgb_g = (uchar16)(rgb_0.s147a, rgb_0.sd, rgb_1.s0369, rgb_1.scf, rgb_2.s258b, rgb_2.se);
+    const uchar16 rgb_b = (uchar16)(rgb_0.s258b, rgb_0.se, rgb_1.s147a, rgb_1.sd, rgb_2.s0369, rgb_2.scf);
+
+    const float16 rgb2u8_red_coef_bt709   = 0.2126f;
+    const float16 rgb2u8_green_coef_bt709 = 0.7152f;
+    const float16 rgb2u8_blue_coef_bt709  = 0.0722f;
+
+    //Computation of 16 greyscale values in float
+    const float16 greyscale_f_0 = rgb2u8_red_coef_bt709 * convert_float16(rgb_r) + rgb2u8_green_coef_bt709 * convert_float16(rgb_g) + rgb2u8_blue_coef_bt709 * convert_float16(rgb_b);
+
+    //Convert it to 16 grayscale uchar values
+    const uchar16 greyscale_u8_0 = convert_uchar16_sat_rtz(greyscale_f_0);
+
+    vstore16(greyscale_u8_0, 0, out.ptr);
+}
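+// For reference, the conversion above uses the BT.709 luma weights:
+//   Y = 0.2126 * R + 0.7152 * G + 0.0722 * B
+// so, for example, a white pixel (255, 255, 255) maps to Y = 255 and a pure red pixel (255, 0, 0)
+// maps to Y = 54 (truncated towards zero).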
+
 /** Convert an RGB888 image to RGBX8888
  *
  * Global Workgroup Size [ DIV_CEIL(width, 16), height ]
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 16c4363..0e8805f 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -23,12 +23,220 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE)
-#if defined(WIDTH_OFFSET)
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#if defined(DEPTH) && defined(ELEMENT_SIZE)
+
+#if defined(INPUT1_WIDTH)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if VEC_SIZE == 2
+#define SEQ ((int2)(0, 1))
+#elif VEC_SIZE == 4
+#define SEQ ((int4)(0, 1, 2, 3))
+#elif VEC_SIZE == 8
+#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
+#elif VEC_SIZE == 16
+#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
+#else // VEC_SIZE
+#error "Vector size not supported"
+#endif // VEC_SIZE
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x2(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x  = get_global_id(0) * (int)VEC_SIZE;
+    const int y  = get_global_id(1);
+    const int z  = get_global_id(2) % (int)DEPTH;
+    const int w  = get_global_id(2) / (int)DEPTH;
+    const int x1 = min(x, (int)INPUT1_WIDTH);
+    const int x2 = max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE);
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords        = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values    = select(src2_values, src1_values, cond);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
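+// Worked example of the select logic above (illustrative values, not defaults): with
+// INPUT1_WIDTH = 6, VEC_SIZE = 4 and x = 4, the clamped offsets are x1 = 4 and
+// x2 = max(4 - 6, -4) = -2, so cond = ((4,5,6,7) < 6) = (true, true, false, false) keeps lanes 0-1
+// from src1 (columns 4, 5) and lanes 2-3 from src2 (columns 0, 1); the out-of-range lanes loaded
+// through the clamped pointers are discarded by the select.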
+
+#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
+ * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
+ * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src3_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src3_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src3_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src3_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src3_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src3_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src3_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src4_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src4_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src4_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src4_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src4_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src4_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src4_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                         dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void concatenate_width_x4(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(src3),
+    TENSOR4D_DECLARATION(src4),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
+
+    // Calculate input indices
+    const int x = get_global_id(0) * (int)VEC_SIZE;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % (int)DEPTH;
+    const int w = get_global_id(2) / (int)DEPTH;
+
+    const int x1 = min(x, (int)INPUT1_WIDTH);
+    const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)VEC_SIZE), (int)INPUT2_WIDTH);
+    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)VEC_SIZE), (int)INPUT3_WIDTH);
+    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)VEC_SIZE);
+
+    // Calculate inputs and output addresses
+    const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+    const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+    const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
+
+    const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
+
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+    const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    values = select(src2_values, src1_values, cond_in2);
+    values = select(src3_values, values, cond_in3);
+    values = select(src4_values, values, cond_in4);
+
+    VSTORE(VEC_SIZE)
+    (values, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
+#endif /* defined(INPUT1_WIDTH) */
+#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
+
+#if defined(WIDTH_OFFSET) && defined(DEPTH)
 /** This kernel concatenates the input tensor into the output tensor along the first dimension
  *
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
  * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -37,6 +245,8 @@
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
@@ -45,15 +255,16 @@
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  offset                            The offset to the first valid element of the output tensor in bytes
  */
 __kernel void concatenate_width(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
 {
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
 
     VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
@@ -61,10 +272,13 @@
     VSTORE(VEC_SIZE)
     (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
 }
-#endif // defined(WIDTH_OFFSET)
+#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
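
Editor's note: concatenate_width writes a single input tensor at element offset WIDTH_OFFSET into the output, so a caller concatenating several tensors along the first dimension would presumably enqueue the kernel once per input with a running offset. A minimal sketch of how such offsets would be derived, assuming WIDTH_OFFSET is expressed in elements (matching the pointer arithmetic on (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET above) and using made-up widths:

#include <cstdio>
#include <vector>

int main()
{
    const std::vector<int> input_widths = {16, 32, 8}; // hypothetical input widths
    int width_offset = 0;
    for(size_t i = 0; i < input_widths.size(); ++i)
    {
        std::printf("input %zu: -DWIDTH_OFFSET=%d\n", i, width_offset);
        width_offset += input_widths[i]; // next input starts after this one
    }
    return 0;
}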
 
 /** This kernel concatenates the input tensor into the output tensor along the third dimension
  *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16, F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -97,4 +311,4 @@
     VSTORE(VEC_SIZE)
     (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z));
 }
-#endif // defined(DATA_TYPE)
\ No newline at end of file
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 930a676..4bbbf11 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -23,6 +23,60 @@
  */
 #include "helpers.h"
 
+#if defined(PAD00) && defined(PAD10) && defined(PAD20) && defined(PAD21) && defined(PAD30) && defined(DATA_TYPE) && defined(VEC_SIZE) // Compile time constants
+
+/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DPAD{d}{0,1} = padding before{0} and after{1} dimension d (d < 4)
+ * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3)
+ * -# -DDATA_TYPE = Input and output datatypes.
+ * -# -DVEC_SIZE = The number of elements processed per work item
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_pad_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int offset_x = PAD00;
+    const int offset_y = PAD10;
+    const int offset_z = PAD20;
+
+#if PAD30 > 0
+    const size_t in_batch    = get_global_id(2) / DEPTH;
+    const int    total_depth = DEPTH + PAD20 + PAD21;
+    const int    offset_w    = PAD30 * total_depth + in_batch * (PAD20 + PAD21);
+#else  // PAD30 == 0
+    const int offset_w = 0;
+#endif // PAD30
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)tensor3D_offset(&out, offset_x, offset_y, offset_z + offset_w));
+}
+#endif // Compile time constants
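
Editor's note: the z-offset arithmetic in copy_pad_tensor above can be checked on the host. For a work item at flattened input index gz = batch * DEPTH + d, the extra output offset along the flattened third dimension is PAD20 plus, when the batch dimension is itself padded (PAD30 > 0), whole padded-depth slices for the leading batch padding and the per-batch depth padding of earlier batches. The PAD/DEPTH values below are illustrative, not taken from any particular layer.

#include <cstdio>

int main()
{
    const int DEPTH = 4, PAD20 = 1, PAD21 = 1, PAD30 = 2;
    const int total_depth = DEPTH + PAD20 + PAD21;

    for(int gz = 0; gz < 2 * DEPTH; ++gz) // two input batches
    {
        const int in_batch = gz / DEPTH;
        const int offset_w = PAD30 * total_depth + in_batch * (PAD20 + PAD21);
        const int out_z    = gz + PAD20 + offset_w; // destination plane index
        std::printf("in z=%d (batch %d) -> out z=%d\n", gz, in_batch, out_z);
    }
    return 0;
}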
+
 /** Performs a copy of input tensor to the output tensor.
  *
  * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -56,4 +110,4 @@
     // Store result
     VSTORE(VEC_SIZE)
     (data, 0, (__global DATA_TYPE *)out.ptr);
-}
\ No newline at end of file
+}
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index e15482c..e5169f9 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -25,7 +25,7 @@
 
 /** This function applies upsample on an input image.
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: QASYMM8/F16/F32
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -33,7 +33,7 @@
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F16/F32
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index 77a76b6..bfaa92b 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -24,7 +24,7 @@
 
 #include "helpers.h"
 
-#if defined(DEPTH_MULTIPLIER)
+#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 #if defined(CONV_STRIDE_X)
 
 #if CONV_STRIDE_X == 1
@@ -147,12 +147,12 @@
 
 /** This OpenCL kernel computes the depthwise convolution 3x3
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
@@ -188,23 +188,28 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 #endif //defined(HAS_BIAS)
 
-    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
 
     uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
-    float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
-    float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
-    float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));
+    float3 weights_values0 = vload3(0, (__global float *)(weights_addr + offset.s0));
+    float3 weights_values1 = vload3(0, (__global float *)(weights_addr + offset.s1));
+    float3 weights_values2 = vload3(0, (__global float *)(weights_addr + offset.s2));
 
     float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                    weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                    weights_values2.s0, weights_values2.s1, weights_values2.s2);
 #if defined(HAS_BIAS)
-    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+    pixels += (float2)(*((__global float *)(biases.ptr + channel * biases_stride_x)));
 #endif //defined(HAS_BIAS)
 
     vstore2(pixels, 0, (__global float *)dst.ptr);
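
Editor's note: the channel/batch bookkeeping introduced above rewinds the per-plane pointer so that all output channels produced by the same input channel (OFM = IFM * DEPTH_MULTIPLIER) read the same input plane. A small stand-alone check of that index arithmetic, with illustrative sizes:

#include <cassert>
#include <cstdio>

int main()
{
    const int DST_CHANNELS = 6, DEPTH_MULTIPLIER = 3;
    const int SRC_CHANNELS = DST_CHANNELS / DEPTH_MULTIPLIER;

    for(int gz = 0; gz < 2 * DST_CHANNELS; ++gz) // two batches
    {
        const int channel   = gz % DST_CHANNELS;
        const int batch     = gz / DST_CHANNELS;
        // Same correction terms as the kernel subtracts from its z plane index
        const int corrected = gz - batch * SRC_CHANNELS * (DEPTH_MULTIPLIER - 1)
                              - (channel - channel / DEPTH_MULTIPLIER);
        assert(corrected == batch * SRC_CHANNELS + channel / DEPTH_MULTIPLIER);
        std::printf("gz=%2d -> input plane %d\n", gz, corrected);
    }
    return 0;
}
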
@@ -266,12 +271,12 @@
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
  * stride_x and stride_y are equal to 1
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
@@ -307,15 +312,19 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
 
     float2 pixels0 = 0.0f;
     float2 pixels1 = 0.0f;
     float2 pixels2 = 0.0f;
     float2 pixels3 = 0.0f;
 
-    __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -346,7 +355,7 @@
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
-    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
+    float bias = *((__global float *)(vector_offset(&biases, channel)));
 
     pixels0 += (float2)bias;
     pixels1 += (float2)bias;
@@ -363,12 +372,12 @@
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
  * stride_x and stride_y are equal to 2
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: F32
@@ -404,13 +413,17 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
 
     float2 pixels0 = 0.0f;
     float2 pixels1 = 0.0f;
 
-    __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
     // Load the weights
     float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
@@ -439,7 +452,7 @@
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
-    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));
+    float bias = *((__global float *)(vector_offset(&biases, channel)));
 
     pixels0 += (float2)bias;
     pixels1 += (float2)bias;
@@ -449,7 +462,7 @@
     vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
 }
 
-#endif // defined(DEPTH_MULTIPLIER)
+#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 
 #if defined(NCHW)
 #define in_stride_x src_stride_x
@@ -617,7 +630,7 @@
 
 #endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
 
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 #if defined(CONV_STRIDE_X)
 #if CONV_STRIDE_X == 1
 #define convolution1x3_f16 convolution1x3_stride_1_f16
@@ -740,14 +753,14 @@
 
 /** This OpenCL kernel computes the depthwise convolution 3x3
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -781,23 +794,28 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 #endif //defined(HAS_BIAS)
 
-    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
 
     uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
-    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
-    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
-    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));
+    half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));
+    half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));
+    half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));
 
     half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                       weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                       weights_values2.s0, weights_values2.s1, weights_values2.s2);
 #if defined(HAS_BIAS)
-    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
+    pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));
 #endif //defined(HAS_BIAS)
 
     vstore4(pixels, 0, (__global half *)dst.ptr);
@@ -808,14 +826,14 @@
 /** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
  * when both stride_x and stride_y are equal to 1
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -849,12 +867,16 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
 
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
-    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+    half bias = *((__global half *)(vector_offset(&biases, channel)));
 #endif /* defined(HAS_BIAS) */
 
     half4 pixels0 = 0.0f;
@@ -862,8 +884,9 @@
     half4 pixels2 = 0.0f;
     half4 pixels3 = 0.0f;
 
-    __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
     // Load the weights
     half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
@@ -907,14 +930,14 @@
 /** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3
  * when both stride_x and stride_y are equal to 2
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -948,19 +971,24 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
 
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
-    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
+    half bias = *((__global half *)(vector_offset(&biases, channel)));
 #endif /* defined(HAS_BIAS) */
 
     half4 pixels0 = 0.0f;
     half4 pixels1 = 0.0f;
 
-    __global uchar *weights_addr = (__global uchar *)weights.ptr;
-    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
+    __global uchar *src_addr     = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
 
     // Load the weights
     half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
@@ -994,15 +1022,20 @@
     vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
     vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
 }
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 
-#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
+#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
 
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#if DATA_TYPE != float && DATA_TYPE != half
+#error "Unsupported data type"
+#endif // DATA_TYPE != float && DATA_TYPE != half
+
+#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 
 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 /** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.
  *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
 * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
@@ -1010,14 +1043,16 @@
 * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1025,6 +1060,8 @@
  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
@@ -1041,8 +1078,8 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 __kernel void depthwise_convolution_3x3_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
@@ -1051,11 +1088,20 @@
 {
     int x = get_global_id(0); // channels
     int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
     int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float) * VEC_SIZE;
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
 
     int  z_coord  = 0;
     int4 offset   = 0;
@@ -1065,15 +1111,15 @@
     VEC_FLOAT acc = 0;
 
     // Load weights
-    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
-    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
-    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
 
     // Load input values
     // z == 0
@@ -1085,27 +1131,27 @@
     offset  = y_offset + (int4)(z_coord * src_stride_z);
     offset  = min(offset, (int4)max_offset);
 
-    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
 
     // z == 1
     // z_coord can be only negative for z = 0 so we do not need to clamp it
     // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
     z_coord           = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + 1;
     offset            = y_offset + (int4)(z_coord * src_stride_z);
-    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
 
     // z == 2
     // After z = 1 we can simply add src_stride_z to offset without updating z_coord
     // However offset can be out-of-bound so we need to check if it is greater than max_offset
     offset += (int4)src_stride_z;
     offset            = min(offset, (int4)max_offset);
-    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
+    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
 
     acc = fma(values0, w0, acc);
     acc = fma(values1, w1, acc);
@@ -1121,13 +1167,18 @@
 
 #if defined(HAS_BIAS)
     Vector    biases      = CONVERT_TO_VECTOR_STRUCT(biases);
-    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global float *)biases.ptr);
+    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
     acc += bias_values;
 #endif // defined(HAS_BIAS)
 
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
     VSTORE(VEC_SIZE)
-    (acc, 0, (__global float *)(dst.ptr));
+    (acc, 0, (__global DATA_TYPE *)(dst_addr));
 }
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
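
Editor's note: when -DDST_DEPTH is defined the NHWC kernels above fold the spatial y coordinate and the batch index into get_global_id(2) and split them again, then add the batch stride to the source pointer. A minimal host-side sketch of that decomposition and of the resulting byte offset; strides and sizes are illustrative placeholders, not values queried from the library.

#include <cstddef>
#include <cstdio>

int main()
{
    const int    DST_DEPTH    = 56;      // output height (third dimension)
    const int    VEC_SIZE     = 4;
    const size_t elem_size    = sizeof(float);
    const size_t src_stride_w = 1000000; // bytes between consecutive batches (made up)

    const int gids[] = {0, 55, 56, 112};
    for(int gid2 : gids)
    {
        const int    z      = gid2 % DST_DEPTH; // spatial coordinate y
        const int    b      = gid2 / DST_DEPTH; // batch index
        const int    x      = 3;                // example channel-block index
        const size_t offset = x * elem_size * VEC_SIZE + b * src_stride_w;
        std::printf("gid2=%3d -> z=%2d b=%d  src byte offset=%zu\n", gid2, z, b, offset);
    }
    return 0;
}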
 
@@ -1141,14 +1192,16 @@
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
 * @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: FP32
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1156,6 +1209,8 @@
  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
@@ -1172,8 +1227,8 @@
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
  */
 __kernel void depthwise_convolution_3x3_nhwc_stride1(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
@@ -1182,11 +1237,20 @@
 {
     int x = get_global_id(0); // channels
     int y = get_global_id(1); // spatial coordinate x
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
     int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float) * VEC_SIZE;
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
 
     int  z_coord  = 0;
     int4 offset   = 0;
@@ -1199,15 +1263,15 @@
     VEC_FLOAT acc3 = 0;
 
     // Load weights
-    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
-    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
-    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
-    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
-    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global float *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));
+    VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));
+    VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));
+    VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));
 
     // Load input values
     // z == 0
@@ -1219,40 +1283,40 @@
     offset  = y_offset + (int4)(z_coord * src_stride_z);
     offset  = min(offset, (int4)max_offset);
 
-    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
-    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+    VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
 
     // z == 1
     // z_coord can be only negative for z = 0 so we do not need to clamp it
     // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
     z_coord           = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
     offset            = y_offset + (int4)(z_coord * src_stride_z);
-    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
-    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+    VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
 
     // z == 2
     // After z = 1 we can simply add src_stride_z to offset without updating z_coord
     // However offset can be out-of-bound so we need to check if it is greater than max_offset
     offset += (int4)src_stride_z;
     offset             = min(offset, (int4)max_offset);
-    VEC_FLOAT values8  = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values9  = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
-    VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+    VEC_FLOAT values8  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values9  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
 
     // z == 3
     // After z = 1 we can simply add src_stride_z to offset without updating z_coord
     // However offset can be out-of-bound so we need to check if it is greater than max_offset
     offset += (int4)src_stride_z;
     offset             = min(offset, (int4)max_offset);
-    VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s0));
-    VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s1));
-    VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s2));
-    VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global float *)(src_addr + offset.s3));
+    VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));
+    VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));
+    VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));
+    VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));
 
     acc0 = fma(values0, w0, acc0);
     acc0 = fma(values1, w1, acc0);
@@ -1299,7 +1363,7 @@
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
 
-    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global float *)biases.ptr);
+    VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
 
     acc0 += bias_values;
     acc1 += bias_values;
@@ -1307,23 +1371,27 @@
     acc3 += bias_values;
 #endif // defined(HAS_BIAS)
 
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
 
     VSTORE(VEC_SIZE)
-    (acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    (acc0, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
     VSTORE(VEC_SIZE)
-    (acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+    (acc1, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 
 #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
 #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     {
         VSTORE(VEC_SIZE)
-        (acc2, 0, (__global float *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
+        (acc2, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));
         VSTORE(VEC_SIZE)
-        (acc3, 0, (__global float *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
+        (acc3, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));
     }
 }
 
 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
-#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
\ No newline at end of file
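
Editor's note: the stride-1 NHWC kernel above has each work item write NUM_PLANES_PROCESSED output planes, so when the output height is not a multiple of that count the second plane of the last work item must be skipped, which is what the (DST_DIM_2 % NUM_PLANES_PROCESSED) guard does. A small sketch of that guard with illustrative values:

#include <cstdio>

int main()
{
    const int DST_DIM_2 = 7, NUM_PLANES_PROCESSED = 2;
    for(int z = 0; z * NUM_PLANES_PROCESSED < DST_DIM_2; ++z)
    {
        // Same condition the kernel uses before storing the second plane
        const bool write_second_plane = (z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2;
        std::printf("work item z=%d writes plane %d%s\n",
                    z, z * NUM_PLANES_PROCESSED,
                    write_second_plane ? " and the next one" : " only");
    }
    return 0;
}
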
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index fe902ed..5a732b4 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -24,7 +24,7 @@
 
 #include "helpers_asymm.h"
 
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
 #if defined(FUSED_ACTIVATION)
 #define DATA_TYPE uchar
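
Editor's note: the guard above now accepts either an OUTPUT_OFFSET/OUTPUT_MULTIPLIER/OUTPUT_SHIFT triple or a REAL_MULTIPLIER, which presumably correspond to a fixed-point and a floating-point requantisation path. A rough scalar sketch of the two paths follows; the exact rounding used by the library's helper macros may differ, and all constants are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Float path: scale the int32 accumulator by a real multiplier.
static uint8_t requantize_float(int32_t acc, float real_multiplier, int32_t output_offset)
{
    const int32_t v = static_cast<int32_t>(std::lround(acc * real_multiplier)) + output_offset;
    return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

// Fixed-point path: multiply by a Q0.31 multiplier, then shift right (rounding omitted).
static uint8_t requantize_fixed(int32_t acc, int32_t multiplier, int shift, int32_t output_offset)
{
    const int64_t prod = static_cast<int64_t>(acc) * multiplier;
    const int32_t v    = static_cast<int32_t>(prod >> (31 + shift)) + output_offset;
    return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

int main()
{
    const int32_t acc = 12345; // hypothetical int32 accumulator
    std::printf("%d %d\n",
                static_cast<int>(requantize_float(acc, 0.0035f, 128)),
                static_cast<int>(requantize_fixed(acc, 1888946593 /* ~0.88 in Q0.31 */, 7, 128)));
    return 0;
}
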
@@ -37,21 +37,21 @@
 #define ACTIVATION_FUNC(x) (x)
 #endif /* defined(FUSED_ACTIVATION) */
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
-#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER)
+#if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
 
 #if CONV_STRIDE_X > 3
 #error "Stride X not supported"
 #endif /* CONV_STRIDE_X > 3 */
 
-#if !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
 
 #if CONV_STRIDE_X == 1
 #define GET_VALUES(first_value, left, middle, right)                              \
@@ -87,14 +87,14 @@
 
 /** This function computes the depthwise convolution quantized.
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -129,18 +129,25 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
 
-    int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
+    int bias_value = *((__global int *)(vector_offset(&biases, channel)));
 #endif //defined(HAS_BIAS)
 
-    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
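+    // (the input channel is channel / DEPTH_MULTIPLIER; the batch term skips the extra output channels created by the depth multiplier)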
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
 
-    uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
-    uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
-    uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
+    uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
+    uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
+    uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
 
     int8 values0 = 0;
     int8 sum0    = 0;
@@ -241,7 +248,16 @@
 #endif /* CONV_STRIDE_Y == 1 */
 #endif /* K_OFFSET != 0 */
 
+#if defined(REAL_MULTIPLIER)
+
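+    // Requantize using a single floating-point multiplication with the REAL_MULTIPLIER scale passed at compile time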
+    values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
     values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
     values0 += (int8)OUTPUT_OFFSET;
     uchar8 res0 = convert_uchar8_sat(values0);
     res0        = max(res0, (uchar8)0);
@@ -249,8 +265,16 @@
 
     vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
 #if CONV_STRIDE_Y == 1
+#if defined(REAL_MULTIPLIER)
+
+    values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
 
     values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
     values1 += (int8)OUTPUT_OFFSET;
     uchar8 res1 = convert_uchar8_sat(values1);
     res1        = max(res1, (uchar8)0);
@@ -260,7 +284,7 @@
 #endif /* CONV_STRIDE_Y == 1 */
 }
 
-#else // !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8))
 
 #if CONV_STRIDE_X == 1
 #define GET_VALUES(first_value, left, middle, right)                 \
@@ -295,14 +319,14 @@
 #endif /* CONV_STRIDE_X */
 /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -337,18 +361,25 @@
 {
     Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
     Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
-#if defined(HAS_BIAS)
-    Vector   biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
 
-    const int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
+    // Extract channel and linearized batch indices
+    const int channel = get_global_id(2) % DST_CHANNELS;
+    const int batch   = get_global_id(2) / DST_CHANNELS;
+
+#if defined(HAS_BIAS)
+    Vector    biases  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
 #endif //defined(HAS_BIAS)
 
-    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;
+    // Load relevant input and weights data (Accounts for the depth multiplier when indexing the input, OFM = IFM * DEPTH_MULTIPLIER)
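+    // (the input channel is channel / DEPTH_MULTIPLIER; the batch term skips the extra output channels created by the depth multiplier)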
+    src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
+    __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
 
-    uchar3 w0 = vload3(0, weights.ptr + 0 * weights_stride_y);
-    uchar3 w1 = vload3(0, weights.ptr + 1 * weights_stride_y);
-    uchar3 w2 = vload3(0, weights.ptr + 2 * weights_stride_y);
+    uchar3 w0 = vload3(0, weights_addr + 0 * weights_stride_y);
+    uchar3 w1 = vload3(0, weights_addr + 1 * weights_stride_y);
+    uchar3 w2 = vload3(0, weights_addr + 2 * weights_stride_y);
 
     uchar8 left0, middle0, right0;
     uchar8 left1, middle1, right1;
@@ -383,69 +414,69 @@
 #endif /* WEIGHTS_OFFSET != 0 */
 #endif // CONV_STRIDE_Y == 1
 
-    ARM_DOT(left0.s0, middle0.s0, right0.s0, left1.s0, w0.s0, w0.s1, w0.s2, w1.s0, values0.s0);
-    ARM_DOT(middle1.s0, right1.s0, left2.s0, middle2.s0, w1.s1, w1.s2, w2.s0, w2.s1, values0.s0);
+    ARM_DOT((uchar4)(left0.s0, middle0.s0, right0.s0, left1.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
+    ARM_DOT((uchar4)(middle1.s0, right1.s0, left2.s0, middle2.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
     values0.s0 += right2.s0 * w2.s2;
 
-    ARM_DOT(left0.s1, middle0.s1, right0.s1, left1.s1, w0.s0, w0.s1, w0.s2, w1.s0, values0.s1);
-    ARM_DOT(middle1.s1, right1.s1, left2.s1, middle2.s1, w1.s1, w1.s2, w2.s0, w2.s1, values0.s1);
+    ARM_DOT((uchar4)(left0.s1, middle0.s1, right0.s1, left1.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
+    ARM_DOT((uchar4)(middle1.s1, right1.s1, left2.s1, middle2.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
     values0.s1 += right2.s1 * w2.s2;
 
-    ARM_DOT(left0.s2, middle0.s2, right0.s2, left1.s2, w0.s0, w0.s1, w0.s2, w1.s0, values0.s2);
-    ARM_DOT(middle1.s2, right1.s2, left2.s2, middle2.s2, w1.s1, w1.s2, w2.s0, w2.s1, values0.s2);
+    ARM_DOT((uchar4)(left0.s2, middle0.s2, right0.s2, left1.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
+    ARM_DOT((uchar4)(middle1.s2, right1.s2, left2.s2, middle2.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
     values0.s2 += right2.s2 * w2.s2;
 
-    ARM_DOT(left0.s3, middle0.s3, right0.s3, left1.s3, w0.s0, w0.s1, w0.s2, w1.s0, values0.s3);
-    ARM_DOT(middle1.s3, right1.s3, left2.s3, middle2.s3, w1.s1, w1.s2, w2.s0, w2.s1, values0.s3);
+    ARM_DOT((uchar4)(left0.s3, middle0.s3, right0.s3, left1.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
+    ARM_DOT((uchar4)(middle1.s3, right1.s3, left2.s3, middle2.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
     values0.s3 += right2.s3 * w2.s2;
 
-    ARM_DOT(left0.s4, middle0.s4, right0.s4, left1.s4, w0.s0, w0.s1, w0.s2, w1.s0, values0.s4);
-    ARM_DOT(middle1.s4, right1.s4, left2.s4, middle2.s4, w1.s1, w1.s2, w2.s0, w2.s1, values0.s4);
+    ARM_DOT((uchar4)(left0.s4, middle0.s4, right0.s4, left1.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
+    ARM_DOT((uchar4)(middle1.s4, right1.s4, left2.s4, middle2.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
     values0.s4 += right2.s4 * w2.s2;
 
-    ARM_DOT(left0.s5, middle0.s5, right0.s5, left1.s5, w0.s0, w0.s1, w0.s2, w1.s0, values0.s5);
-    ARM_DOT(middle1.s5, right1.s5, left2.s5, middle2.s5, w1.s1, w1.s2, w2.s0, w2.s1, values0.s5);
+    ARM_DOT((uchar4)(left0.s5, middle0.s5, right0.s5, left1.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
+    ARM_DOT((uchar4)(middle1.s5, right1.s5, left2.s5, middle2.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
     values0.s5 += right2.s5 * w2.s2;
 
-    ARM_DOT(left0.s6, middle0.s6, right0.s6, left1.s6, w0.s0, w0.s1, w0.s2, w1.s0, values0.s6);
-    ARM_DOT(middle1.s6, right1.s6, left2.s6, middle2.s6, w1.s1, w1.s2, w2.s0, w2.s1, values0.s6);
+    ARM_DOT((uchar4)(left0.s6, middle0.s6, right0.s6, left1.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
+    ARM_DOT((uchar4)(middle1.s6, right1.s6, left2.s6, middle2.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
     values0.s6 += right2.s6 * w2.s2;
 
-    ARM_DOT(left0.s7, middle0.s7, right0.s7, left1.s7, w0.s0, w0.s1, w0.s2, w1.s0, values0.s7);
-    ARM_DOT(middle1.s7, right1.s7, left2.s7, middle2.s7, w1.s1, w1.s2, w2.s0, w2.s1, values0.s7);
+    ARM_DOT((uchar4)(left0.s7, middle0.s7, right0.s7, left1.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
+    ARM_DOT((uchar4)(middle1.s7, right1.s7, left2.s7, middle2.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
     values0.s7 += right2.s7 * w2.s2;
 
 #if CONV_STRIDE_Y == 1
-    ARM_DOT(left1.s0, middle1.s0, right1.s0, left2.s0, w0.s0, w0.s1, w0.s2, w1.s0, values1.s0);
-    ARM_DOT(middle2.s0, right2.s0, left3.s0, middle3.s0, w1.s1, w1.s2, w2.s0, w2.s1, values1.s0);
+    ARM_DOT((uchar4)(left1.s0, middle1.s0, right1.s0, left2.s0), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
+    ARM_DOT((uchar4)(middle2.s0, right2.s0, left3.s0, middle3.s0), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
     values1.s0 += right3.s0 * w2.s2;
 
-    ARM_DOT(left1.s1, middle1.s1, right1.s1, left2.s1, w0.s0, w0.s1, w0.s2, w1.s0, values1.s1);
-    ARM_DOT(middle2.s1, right2.s1, left3.s1, middle3.s1, w1.s1, w1.s2, w2.s0, w2.s1, values1.s1);
+    ARM_DOT((uchar4)(left1.s1, middle1.s1, right1.s1, left2.s1), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
+    ARM_DOT((uchar4)(middle2.s1, right2.s1, left3.s1, middle3.s1), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
     values1.s1 += right3.s1 * w2.s2;
 
-    ARM_DOT(left1.s2, middle1.s2, right1.s2, left2.s2, w0.s0, w0.s1, w0.s2, w1.s0, values1.s2);
-    ARM_DOT(middle2.s2, right2.s2, left3.s2, middle3.s2, w1.s1, w1.s2, w2.s0, w2.s1, values1.s2);
+    ARM_DOT((uchar4)(left1.s2, middle1.s2, right1.s2, left2.s2), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
+    ARM_DOT((uchar4)(middle2.s2, right2.s2, left3.s2, middle3.s2), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
     values1.s2 += right3.s2 * w2.s2;
 
-    ARM_DOT(left1.s3, middle1.s3, right1.s3, left2.s3, w0.s0, w0.s1, w0.s2, w1.s0, values1.s3);
-    ARM_DOT(middle2.s3, right2.s3, left3.s3, middle3.s3, w1.s1, w1.s2, w2.s0, w2.s1, values1.s3);
+    ARM_DOT((uchar4)(left1.s3, middle1.s3, right1.s3, left2.s3), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
+    ARM_DOT((uchar4)(middle2.s3, right2.s3, left3.s3, middle3.s3), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
     values1.s3 += right3.s3 * w2.s2;
 
-    ARM_DOT(left1.s4, middle1.s4, right1.s4, left2.s4, w0.s0, w0.s1, w0.s2, w1.s0, values1.s4);
-    ARM_DOT(middle2.s4, right2.s4, left3.s4, middle3.s4, w1.s1, w1.s2, w2.s0, w2.s1, values1.s4);
+    ARM_DOT((uchar4)(left1.s4, middle1.s4, right1.s4, left2.s4), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
+    ARM_DOT((uchar4)(middle2.s4, right2.s4, left3.s4, middle3.s4), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
     values1.s4 += right3.s4 * w2.s2;
 
-    ARM_DOT(left1.s5, middle1.s5, right1.s5, left2.s5, w0.s0, w0.s1, w0.s2, w1.s0, values1.s5);
-    ARM_DOT(middle2.s5, right2.s5, left3.s5, middle3.s5, w1.s1, w1.s2, w2.s0, w2.s1, values1.s5);
+    ARM_DOT((uchar4)(left1.s5, middle1.s5, right1.s5, left2.s5), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
+    ARM_DOT((uchar4)(middle2.s5, right2.s5, left3.s5, middle3.s5), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
     values1.s5 += right3.s5 * w2.s2;
 
-    ARM_DOT(left1.s6, middle1.s6, right1.s6, left2.s6, w0.s0, w0.s1, w0.s2, w1.s0, values1.s6);
-    ARM_DOT(middle2.s6, right2.s6, left3.s6, middle3.s6, w1.s1, w1.s2, w2.s0, w2.s1, values1.s6);
+    ARM_DOT((uchar4)(left1.s6, middle1.s6, right1.s6, left2.s6), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
+    ARM_DOT((uchar4)(middle2.s6, right2.s6, left3.s6, middle3.s6), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
     values1.s6 += right3.s6 * w2.s2;
 
-    ARM_DOT(left1.s7, middle1.s7, right1.s7, left2.s7, w0.s0, w0.s1, w0.s2, w1.s0, values1.s7);
-    ARM_DOT(middle2.s7, right2.s7, left3.s7, middle3.s7, w1.s1, w1.s2, w2.s0, w2.s1, values1.s7);
+    ARM_DOT((uchar4)(left1.s7, middle1.s7, right1.s7, left2.s7), (uchar4)(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
+    ARM_DOT((uchar4)(middle2.s7, right2.s7, left3.s7, middle3.s7), (uchar4)(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
     values1.s7 += right3.s7 * w2.s2;
 #endif // CONV_STRIDE_Y == 1
 
@@ -480,7 +511,16 @@
 #endif /* CONV_STRIDE_Y == 1 */
 #endif /* K_OFFSET != 0 */
 
+#if defined(REAL_MULTIPLIER)
+
+    values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
     values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
     values0 += (int8)OUTPUT_OFFSET;
     uchar8 res0 = convert_uchar8_sat(values0);
     res0        = max(res0, (uchar8)0);
@@ -489,7 +529,16 @@
     vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
 #if CONV_STRIDE_Y == 1
 
+#if defined(REAL_MULTIPLIER)
+
+    values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
+
+#else // defined(REAL_MULTIPLIER)
+
     values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+
+#endif // defined(REAL_MULTIPLIER)
+
     values1 += (int8)OUTPUT_OFFSET;
     uchar8 res1 = convert_uchar8_sat(values1);
     res1        = max(res1, (uchar8)0);
@@ -499,15 +548,16 @@
 #endif /* CONV_STRIDE_Y == 1 */
 }
 
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
-#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) */
+#endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
 
 #if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
 
 #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
 
 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
 #define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
 #define VEC_USHORT VEC_DATA_TYPE(ushort, VEC_SIZE)
 
@@ -523,37 +573,66 @@
 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
 #endif /* WEIGHTS_OFFSET != 0 */
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
     ({                                                                                                             \
-        ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0);                           \
-        ARM_DOT(val4.s0, val5.s0, val6.s0, val7.s0, w4.s0, w5.s0, w6.s0, w7.s0, acc.s0);                           \
+        ARM_DOT((uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), (uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), acc.s0);       \
+        ARM_DOT((uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), (uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), acc.s0);       \
         acc.s0 += val8.s0 * w8.s0;                                                                                 \
         \
-        ARM_DOT(val0.s1, val1.s1, val2.s1, val3.s1, w0.s1, w1.s1, w2.s1, w3.s1, acc.s1);                           \
-        ARM_DOT(val4.s1, val5.s1, val6.s1, val7.s1, w4.s1, w5.s1, w6.s1, w7.s1, acc.s1);                           \
+        ARM_DOT((uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), (uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), acc.s1);       \
+        ARM_DOT((uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), (uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), acc.s1);       \
         acc.s1 += val8.s1 * w8.s1;                                                                                 \
         \
-        ARM_DOT(val0.s2, val1.s2, val2.s2, val3.s2, w0.s2, w1.s2, w2.s2, w3.s2, acc.s2);                           \
-        ARM_DOT(val4.s2, val5.s2, val6.s2, val7.s2, w4.s2, w5.s2, w6.s2, w7.s2, acc.s2);                           \
+        ARM_DOT((uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), (uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), acc.s2);       \
+        ARM_DOT((uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), (uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), acc.s2);       \
         acc.s2 += val8.s2 * w8.s2;                                                                                 \
         \
-        ARM_DOT(val0.s3, val1.s3, val2.s3, val3.s3, w0.s3, w1.s3, w2.s3, w3.s3, acc.s3);                           \
-        ARM_DOT(val4.s3, val5.s3, val6.s3, val7.s3, w4.s3, w5.s3, w6.s3, w7.s3, acc.s3);                           \
+        ARM_DOT((uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), (uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), acc.s3);       \
+        ARM_DOT((uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), (uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), acc.s3);       \
         acc.s3 += val8.s3 * w8.s3;                                                                                 \
     })
 
 #if WEIGHTS_OFFSET != 0
-#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)                                                                                                             \
-    ({                                                                                                                                                                                                                                         \
-        sum += CONVERT(val0, VEC_INT) + CONVERT(val1, VEC_INT) + CONVERT(val2, VEC_INT) + CONVERT(val3, VEC_INT) + CONVERT(val4, VEC_INT) + CONVERT(val5, VEC_INT) + CONVERT(val6, VEC_INT) + CONVERT(val7, VEC_INT) + CONVERT(val8, VEC_INT); \
-        DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8);                                                                                                                            \
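+// Accumulate the 3x3 convolution with dot products; the ninth tap is zero-padded into a uchar4 so it also goes through ARM_DOT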
+#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \
+    ({                                                                                                                        \
+        ARM_DOT((uchar4)(w0.s0, w1.s0, w2.s0, w3.s0), (uchar4)(val0.s0, val1.s0, val2.s0, val3.s0), acc.s0);                  \
+        ARM_DOT((uchar4)(w4.s0, w5.s0, w6.s0, w7.s0), (uchar4)(val4.s0, val5.s0, val6.s0, val7.s0), acc.s0);                  \
+        ARM_DOT((uchar4)(w8.s0, 0, 0, 0), (uchar4)val8.s0, acc.s0);                                                           \
+        \
+        ARM_DOT((uchar4)(w0.s1, w1.s1, w2.s1, w3.s1), (uchar4)(val0.s1, val1.s1, val2.s1, val3.s1), acc.s1);                  \
+        ARM_DOT((uchar4)(w4.s1, w5.s1, w6.s1, w7.s1), (uchar4)(val4.s1, val5.s1, val6.s1, val7.s1), acc.s1);                  \
+        ARM_DOT((uchar4)(w8.s1, 0, 0, 0), (uchar4)val8.s1, acc.s1);                                                           \
+        \
+        ARM_DOT((uchar4)(w0.s2, w1.s2, w2.s2, w3.s2), (uchar4)(val0.s2, val1.s2, val2.s2, val3.s2), acc.s2);                  \
+        ARM_DOT((uchar4)(w4.s2, w5.s2, w6.s2, w7.s2), (uchar4)(val4.s2, val5.s2, val6.s2, val7.s2), acc.s2);                  \
+        ARM_DOT((uchar4)(w8.s2, 0, 0, 0), (uchar4)val8.s2, acc.s2);                                                           \
+        \
+        ARM_DOT((uchar4)(w0.s3, w1.s3, w2.s3, w3.s3), (uchar4)(val0.s3, val1.s3, val2.s3, val3.s3), acc.s3);                  \
+        ARM_DOT((uchar4)(w4.s3, w5.s3, w6.s3, w7.s3), (uchar4)(val4.s3, val5.s3, val6.s3, val7.s3), acc.s3);                  \
+        ARM_DOT((uchar4)(w8.s3, 0, 0, 0), (uchar4)val8.s3, acc.s3);                                                           \
     })
 #else /* WEIGHTS_OFFSET != 0 */
-#define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
+#define DOT_PRODUCT_ACCUMULATE(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8)
 #endif /* WEIGHTS_OFFSET != 0 */
 
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
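+// Per-lane sum of the nine input values, computed with dot products against vectors of ones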
+#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
+    ({                                                                                   \
+        sum = CONVERT(val0, VEC_INT);                                                    \
+        ARM_DOT((uchar4)(val1.s0, val2.s0, val3.s0, val4.s0), (uchar4)1, sum.s0);        \
+        ARM_DOT((uchar4)(val5.s0, val6.s0, val7.s0, val8.s0), (uchar4)1, sum.s0);        \
+        \
+        ARM_DOT((uchar4)(val1.s1, val2.s1, val3.s1, val4.s1), (uchar4)1, sum.s1);        \
+        ARM_DOT((uchar4)(val5.s1, val6.s1, val7.s1, val8.s1), (uchar4)1, sum.s1);        \
+        \
+        ARM_DOT((uchar4)(val1.s2, val2.s2, val3.s2, val4.s2), (uchar4)1, sum.s2);        \
+        ARM_DOT((uchar4)(val5.s2, val6.s2, val7.s2, val8.s2), (uchar4)1, sum.s2);        \
+        \
+        ARM_DOT((uchar4)(val1.s3, val2.s3, val3.s3, val4.s3), (uchar4)1, sum.s3);        \
+        ARM_DOT((uchar4)(val5.s3, val6.s3, val7.s3, val8.s3), (uchar4)1, sum.s3);        \
+    })
+
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.
@@ -565,14 +644,16 @@
  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -580,6 +661,8 @@
  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
@@ -596,8 +679,8 @@
  * @param[in] max_offset                            Max offset for the input tensor
  */
 __kernel void depthwise_convolution_3x3_quantized_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
@@ -606,17 +689,34 @@
 {
     const int x = get_global_id(0); // channels
     const int y = get_global_id(1); // spatial coordinate x
-    const int z = get_global_id(2); // spatial coordinate y
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else  /* defined(DST_DEPTH) */
+    int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
 
-    int        z_coord  = 0;
-    int4       offset   = 0;
-    const int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
 
-    // We compute 2x1x1 [C,W,H] elements
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute 4x1x1 [C,W,H] elements
     VEC_INT acc = 0, sum = 0;
 
     // Load weights
@@ -698,15 +798,28 @@
     acc += (VEC_INT)K_OFFSET;
 #endif /* K_OFFSET != 0 */
 
+#if defined(REAL_MULTIPLIER)
+
+    acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else  // defined(REAL_MULTIPLIER)
+
     acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // defined(REAL_MULTIPLIER)
+
     acc += (VEC_INT)OUTPUT_OFFSET;
 
     VEC_UCHAR res = CONVERT_SAT(acc, VEC_UCHAR);
     res           = CLAMP(res, (VEC_UCHAR)0, (VEC_UCHAR)255);
 
-    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
+
     VSTORE(VEC_SIZE)
-    (res, 0, dst.ptr);
+    (ACTIVATION_FUNC(res), 0, dst_addr);
 }
 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)
 
@@ -720,14 +833,16 @@
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -735,6 +850,8 @@
  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
@@ -752,8 +869,8 @@
  */
 
 __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
@@ -762,17 +879,34 @@
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else  /* defined(DST_DEPTH) */
+    int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
 
-    int  z_coord  = 0;
-    int4 offset   = 0;
-    int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
 
-    // We compute 2x2x2 [C,W,H] elements
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute 4x2x2 [C,W,H] elements
     VEC_INT acc0 = 0, sum0 = 0;
     VEC_INT acc1 = 0, sum1 = 0;
     VEC_INT acc2 = 0, sum2 = 0;
@@ -916,11 +1050,22 @@
     acc3 += (VEC_INT)K_OFFSET;
 #endif /* K_OFFSET != 0 */
 
+#if defined(REAL_MULTIPLIER)
+
+    acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
+
     acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
     acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
     acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
     acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
+#endif // defined(REAL_MULTIPLIER)
+
     acc0 += (VEC_INT)OUTPUT_OFFSET;
     acc1 += (VEC_INT)OUTPUT_OFFSET;
     acc2 += (VEC_INT)OUTPUT_OFFSET;
@@ -936,25 +1081,29 @@
     res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
     res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
 
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#endif /* defined(DST_DEPTH) */
 
     VSTORE(VEC_SIZE)
-    (res0, 0, dst_addr + 0 * dst_stride_y);
+    (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
     VSTORE(VEC_SIZE)
-    (res1, 0, dst_addr + 1 * dst_stride_y);
+    (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
 
 #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
 #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
     {
         VSTORE(VEC_SIZE)
-        (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
+        (ACTIVATION_FUNC(res2), 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
         VSTORE(VEC_SIZE)
-        (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
+        (ACTIVATION_FUNC(res3), 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
     }
 }
 
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product
  *
  * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)
@@ -963,15 +1112,19 @@
  * @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
+ * @note If REAL_MULTIPLIER is passed at compile time (e.g. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
+ *       If not, the quantization will be performed using a fixed point multiplication
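+ *       (A multiplier of this form usually encodes the combined quantization scale, e.g. input_scale * weights_scale / output_scale.)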
  *
- * @param[in] src_ptr                               Pointer to the source image. Supported data types: QASYMM8
- * @param[in] src_stride_x                          Stride of the source image in X dimension (in bytes)
+ * @param[in] src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                          Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                          Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source image
  * @param[in] src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                          Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                            src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes     The offset of the first element in the source tensor
  * @param[in] dst_ptr                               Pointer to the destination tensor. Supported data types: QASYMM8
  * @param[in] dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
@@ -979,6 +1132,8 @@
  * @param[in] dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                            dst_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_w                          Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                            dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
  * @param[in] weights_ptr                           Pointer to the weights tensor. Supported data types: QASYMM8
  * @param[in] weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
@@ -992,34 +1147,52 @@
  * @param[in] biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
  * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
+ * @param[in] max_offset                            The maximum allowed offset for the input tensor
  */
 
 __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
     TENSOR3D_DECLARATION(weights),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
+#endif // defined(HAS_BIAS)
     int max_offset)
 {
     int x = get_global_id(0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
+#if defined(DST_DEPTH)
+    int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
+    int b = get_global_id(2) / (int)DST_DEPTH; // batch
+#else /* defined(DST_DEPTH) */
+    int z = get_global_id(2); // spatial coordinate y
+#endif /* defined(DST_DEPTH) */
 
     Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);
 
+#if defined(DST_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
+#else  /* defined(DST_DEPTH) */
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+#endif /* defined(DST_DEPTH) */
 
-    int  z_coord  = 0;
-    int4 offset   = 0;
-    int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;
+    int  z_coord = 0;
+    int4 offset  = 0;
+    int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;
 
-    // We compute 2x2x2 [C,W,H] elements
-    VEC_INT acc0 = 0, sum0 = 0;
-    VEC_INT acc1 = 0, sum1 = 0;
-    VEC_INT acc2 = 0, sum2 = 0;
-    VEC_INT acc3 = 0, sum3 = 0;
+    // Only for y = 0 can the coordinate be negative. If so, we clamp it to SRC_DIM_1
+    y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
+    y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
+    y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
+    y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
+
+    int4 y_offset = convert_int4(y_coord * (int)src_stride_y);
+
+    // We compute 4x2x1 [C,W,H] elements
+    VEC_INT acc0 = 0;
+    VEC_INT acc1 = 0;
+    VEC_INT sum0 = 0;
+    VEC_INT sum1 = 0;
 
     // Load weights
     VEC_UCHAR w0 = VLOAD(VEC_SIZE)(0, weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z);
@@ -1033,17 +1206,21 @@
     VEC_UCHAR w8 = VLOAD(VEC_SIZE)(0, weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z);
 
 #if INPUT_OFFSET != 0
-    VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)
-                     + CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)
-                     + CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);
-#endif /* INPUT_OFFSET != 0 */
+    // Initialize the final result with the weights reduction multiplied by INPUT_OFFSET
+    DOT_PRODUCT_REDUCTION(acc0, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+
+    // Multiply the weights reduction by INPUT_OFFSET
+    acc0 = INPUT_OFFSET * acc0;
+
+    acc1 = acc0;
+#endif // INPUT_OFFSET != 0
 
     // Load input values
     // z == 0
     // Clamp z_coord as for z = 0, it can be negative
     // z_coord is casted to unsigned int in order to use just a min() operation
     // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
-    z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;
+    z_coord = z - (int)CONV_PAD_TOP;
     z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
     offset  = y_offset + (int4)(z_coord * src_stride_z);
     offset  = min(offset, (int4)max_offset);
@@ -1056,7 +1233,7 @@
     // z == 1
     // z_coord can be only negative for z = 0 so we do not need to clamp it
     // Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset
-    z_coord           = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;
+    z_coord           = z - (int)CONV_PAD_TOP + 1;
     offset            = y_offset + (int4)(z_coord * src_stride_z);
     VEC_UCHAR values4 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
     VEC_UCHAR values5 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
@@ -1073,20 +1250,11 @@
     VEC_UCHAR values10 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
     VEC_UCHAR values11 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
 
-    // z == 3
-    // After z = 1 we can simply add src_stride_z to offset without updating z_coord
-    // However offset can be out-of-bound so we need to check if it is greater than max_offset
-    offset += (int4)(src_stride_z);
-    offset             = min(offset, (int4)max_offset);
-    VEC_UCHAR values12 = VLOAD(VEC_SIZE)(0, src_addr + offset.s0);
-    VEC_UCHAR values13 = VLOAD(VEC_SIZE)(0, src_addr + offset.s1);
-    VEC_UCHAR values14 = VLOAD(VEC_SIZE)(0, src_addr + offset.s2);
-    VEC_UCHAR values15 = VLOAD(VEC_SIZE)(0, src_addr + offset.s3);
+    DOT_PRODUCT_REDUCTION(sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10);
+    DOT_PRODUCT_ACCUMULATE(acc0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
 
-    DOT_PRODUCT_ACCUMULATE(acc0, sum0, values0, values1, values2, values4, values5, values6, values8, values9, values10, w0, w1, w2, w3, w4, w5, w6, w7, w8);
-    DOT_PRODUCT_ACCUMULATE(acc1, sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
-    DOT_PRODUCT_ACCUMULATE(acc2, sum2, values4, values5, values6, values8, values9, values10, values12, values13, values14, w0, w1, w2, w3, w4, w5, w6, w7, w8);
-    DOT_PRODUCT_ACCUMULATE(acc3, sum3, values5, values6, values7, values9, values10, values11, values13, values14, values15, w0, w1, w2, w3, w4, w5, w6, w7, w8);
+    DOT_PRODUCT_REDUCTION(sum1, values1, values2, values3, values5, values6, values7, values9, values10, values11);
+    DOT_PRODUCT_ACCUMULATE(acc1, values1, values2, values3, values5, values6, values7, values9, values10, values11, w0, w1, w2, w3, w4, w5, w6, w7, w8);
 
 #if defined(HAS_BIAS)
     Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
@@ -1095,74 +1263,56 @@
 
     acc0 += bias_values;
     acc1 += bias_values;
-    acc2 += bias_values;
-    acc3 += bias_values;
-#endif /* defined(HAS_BIAS) */
+
+#endif // defined(HAS_BIAS)
 
 #if WEIGHTS_OFFSET != 0
     acc0 += WEIGHTS_OFFSET * sum0;
     acc1 += WEIGHTS_OFFSET * sum1;
-    acc2 += WEIGHTS_OFFSET * sum2;
-    acc3 += WEIGHTS_OFFSET * sum3;
-#endif /* WEIGHTS_OFFSET != 0 */
-
-#if INPUT_OFFSET != 0
-    VEC_INT offs = INPUT_OFFSET * sum_we;
-
-    acc0 += offs;
-    acc1 += offs;
-    acc2 += offs;
-    acc3 += offs;
-#endif /* INPUT_OFFSET != 0 */
+#endif // WEIGHTS_OFFSET != 0
 
 #if K_OFFSET != 0
     acc0 += (VEC_INT)K_OFFSET;
     acc1 += (VEC_INT)K_OFFSET;
-    acc2 += (VEC_INT)K_OFFSET;
-    acc3 += (VEC_INT)K_OFFSET;
-#endif /* K_OFFSET != 0 */
+
+#endif // K_OFFSET != 0
+
+#if defined(REAL_MULTIPLIER)
+
+    acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+    acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
+
+#else // defined(REAL_MULTIPLIER)
 
     acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
     acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
-    acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
 
+#endif // defined(REAL_MULTIPLIER)
     acc0 += (VEC_INT)OUTPUT_OFFSET;
     acc1 += (VEC_INT)OUTPUT_OFFSET;
-    acc2 += (VEC_INT)OUTPUT_OFFSET;
-    acc3 += (VEC_INT)OUTPUT_OFFSET;
 
     VEC_UCHAR res0 = CONVERT_SAT(acc0, VEC_UCHAR);
     VEC_UCHAR res1 = CONVERT_SAT(acc1, VEC_UCHAR);
-    VEC_UCHAR res2 = CONVERT_SAT(acc2, VEC_UCHAR);
-    VEC_UCHAR res3 = CONVERT_SAT(acc3, VEC_UCHAR);
 
     res0 = CLAMP(res0, (VEC_UCHAR)0, (VEC_UCHAR)255);
     res1 = CLAMP(res1, (VEC_UCHAR)0, (VEC_UCHAR)255);
-    res2 = CLAMP(res2, (VEC_UCHAR)0, (VEC_UCHAR)255);
-    res3 = CLAMP(res3, (VEC_UCHAR)0, (VEC_UCHAR)255);
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
+#if defined(DST_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
+#else  /* defined(DST_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
+#endif /* defined(DST_DEPTH) */
 
     VSTORE(VEC_SIZE)
-    (res0, 0, dst_addr + 0 * dst_stride_y);
+    (ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);
     VSTORE(VEC_SIZE)
-    (res1, 0, dst_addr + 1 * dst_stride_y);
-
-#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
-    if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
-#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)
-    {
-        VSTORE(VEC_SIZE)
-        (res2, 0, dst_addr + 0 * dst_stride_y + 1 * dst_stride_z);
-        VSTORE(VEC_SIZE)
-        (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z);
-    }
+    (ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);
 }
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)
 
 #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
 
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 70be058..5299409 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -194,11 +194,11 @@
     __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
 
     weights_addr += id0 * weights_stride_w;
-    const int coordy = id2 - PAD_TOP;
 
+#if(PAD_TOP == 1)
+    const int coordy = id2 - PAD_TOP;
     for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
-#if(PAD_TOP)
         if(coordy < 0) // special case Z = -1 doesn't exists
         {
             //skip first row and load the two next ones
@@ -224,17 +224,69 @@
             CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
             CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
         }
-#else  //PAD_TOP > 0
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+#elif(PAD_TOP == 2)
+    const int coordy = id2 * STRIDE_Y;
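+    // With PAD_TOP == 2 the 5-tap window covers input rows [coordy - 2, coordy + 2]; taps outside [0, SRC_HEIGHT) are skipped below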
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        if(coordy == 0) // special case: Z = -2 and Z = -1 don't exist
+        {
+            //skip the first two rows and load the three next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == 1) // special case: row Z = -1 doesn't exist
+        {
+            // skip the first filter row and load the next four
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 1))
+        {
+            // Special case: the 5-row window extends two rows past the bottom of the input and the Z axis
+            // carries no padding, so only the first three filter rows are applied.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 2))
+        {
+            // Special case: the 5-row window extends one row past the bottom of the input and the Z axis
+            // carries no padding, so only the first four filter rows are applied.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+
+#else  /*  PAD_TOP == 2 */
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
         CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
         CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-#endif // PAD_TOP > 0
-
         src_addr += src_stride_x;
         weights_addr += weights_stride_x;
     }
+#endif /*  PAD_TOP == 1 */
 
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
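The PAD_TOP == 2 branch added above selects which of the five filter rows can be applied near the top and bottom borders; a hedged scalar C sketch of that selection follows (first_row/last_row are hypothetical helpers, while the kernel itself accumulates CONVOLUTION1x5_NHWC calls for exactly these rows).

    /* Illustrative only: range of filter rows applied for a given window anchor coordy (PAD_TOP == 2). */
    static void select_filter_rows(int coordy, int src_height, int *first_row, int *last_row)
    {
        *first_row = 0;
        *last_row  = 4;
        if(coordy == 0) /* rows Z = -2 and Z = -1 do not exist */
        {
            *first_row = 2;
        }
        else if(coordy == 1) /* row Z = -1 does not exist */
        {
            *first_row = 1;
        }
        else if(coordy == src_height - 1) /* two rows below the input are missing */
        {
            *last_row = 2;
        }
        else if(coordy == src_height - 2) /* one row below the input is missing */
        {
            *last_row = 3;
        }
    }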
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl
index df0f9c4..02694f7 100644
--- a/src/core/CL/cl_kernels/flatten.cl
+++ b/src/core/CL/cl_kernels/flatten.cl
@@ -23,12 +23,13 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH)
 
 /** This opencl kernel flattens the first 3 dimensions of the input tensor
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24
+ * @note The width, height and depth of the input tensor must be passed at compile time using -DSRC_WIDTH, -DSRC_HEIGHT and -DSRC_DEPTH. e.g. -DSRC_WIDTH=24, -DSRC_HEIGHT=24, -DSRC_DEPTH=16
+ * @note If the output has 3 dimensions, the 2nd dimension of the output tensor must be passed at compile time using -DDST_DIM1. e.g. -DDST_DIM1=3
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
@@ -37,20 +38,38 @@
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 + * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 + * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void flatten(
-    TENSOR3D_DECLARATION(src),
-    VECTOR_DECLARATION(dst))
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
 {
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
 
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) + get_global_id(1) * (int)SRC_WIDTH + get_global_id(2) * (int)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(
-                                     DATA_TYPE);
+    uint c  = get_global_id(2) % SRC_DEPTH; // input feature map
+    uint b0 = get_global_id(2) / SRC_DEPTH; // batch id
+    uint b1 = 0;
+
+#if defined(DST_DIM1)
+    uint b_tmp = b0;
+    b0 = b_tmp % DST_DIM1; // batch id0
+    b1 = b_tmp / DST_DIM1; // batch id1
+#endif // defined(DST_DIM1)
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+                                 (get_global_id(0) + get_global_id(1) * (uint)SRC_WIDTH + c * (uint)(SRC_WIDTH * SRC_HEIGHT)) * sizeof(DATA_TYPE) +
+                                 b0 * dst_stride_y +
+                                 b1 * dst_stride_z;
 
     *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)src.ptr);
 }
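The output offset produced by the reworked flatten kernel can be reproduced with a small scalar C model; a hedged sketch follows, with the element size and destination strides passed as hypothetical parameters (pass dst_dim1 = 0 to model the case where DST_DIM1 is not defined).

    #include <stddef.h>

    /* Mirrors the index arithmetic of the flatten kernel above (illustrative only). */
    static size_t flatten_dst_offset(size_t gid0, size_t gid1, size_t gid2,
                                     size_t src_width, size_t src_height, size_t src_depth,
                                     size_t dst_dim1, size_t elem_size,
                                     size_t dst_stride_y, size_t dst_stride_z)
    {
        size_t c  = gid2 % src_depth; /* input feature map */
        size_t b0 = gid2 / src_depth; /* batch id */
        size_t b1 = 0;
        if(dst_dim1 != 0) /* mirrors the #if defined(DST_DIM1) branch */
        {
            size_t b_tmp = b0;
            b0 = b_tmp % dst_dim1;
            b1 = b_tmp / dst_dim1;
        }
        return (gid0 + gid1 * src_width + c * src_width * src_height) * elem_size
               + b0 * dst_stride_y + b1 * dst_stride_z;
    }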
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 932e0d6..7de15d0 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -84,7 +84,8 @@
 
 #if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
 
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
+/** This OpenCL kernel reshapes the input matrix by transposing each 4x4 block and interleaving the values. If -DUNROLL_BLOCK is passed at compile time,
+ * the 4x4 block is copied row by row instead of being transposed.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
@@ -187,6 +188,12 @@
     a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
 #endif // defined(REINTERPRET_INPUT_AS_3D)
 
+#if defined(UNROLL_BLOCK)
+    vstore4(a0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
+    vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+    vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
+    vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
+#else // defined(UNROLL_BLOCK)
     VEC_DATA_TYPE(DATA_TYPE, 4)
     val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
     vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
@@ -199,6 +206,7 @@
 
     val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
     vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
+#endif // defined(UNROLL_BLOCK)
 }
 #endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
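As a hedged illustration of the two layouts produced by the interleave kernel, the following plain C sketch writes one 4x4 block either transposed column by column (the default) or unrolled row by row (when -DUNROLL_BLOCK is passed); the real kernel works on vload4/vstore4 vectors, and mult stands for MULT_INTERLEAVE4X4_HEIGHT.

    /* Illustrative only: element layout of one 4x4 block in the reshaped output. */
    static void store_block_4x4(const float a[4][4], float *dst, int mult, int unroll_block)
    {
        for(int i = 0; i < 4; ++i)
        {
            for(int j = 0; j < 4; ++j)
            {
                if(unroll_block)
                {
                    dst[(4 * i) * mult + j] = a[i][j]; /* rows copied as-is, one 4-element group per row */
                }
                else
                {
                    dst[(4 * j) * mult + i] = a[i][j]; /* column j of the block becomes one 4-element group */
                }
            }
        }
    }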
 
@@ -871,6 +879,183 @@
 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
 
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in 32-bit floating point.
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float8 c00 = 0.0f;
+    float8 c10 = 0.0f;
+    float8 c20 = 0.0f;
+    float8 c30 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+        b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+    // Multiply by the weight of matrix product
+    c00 = c00 * (float8)ALPHA;
+    c10 = c10 * (float8)ALPHA;
+    c20 = c20 * (float8)ALPHA;
+    c30 = c30 * (float8)ALPHA;
+#endif // defined(ALPHA)
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+    // Store 4x8 block
+    vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+    // Store 4x8 block
+    vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+    vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+    vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+    vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
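Numerically, the acc32 variant above loads half data, accumulates in 32-bit floats and only narrows back to half on the final store; a hedged scalar C model of one output row follows (half values are represented as float here, and alpha should be passed as 1.0f when -DALPHA is not defined).

    /* Scalar model of the f16 GEMM with fp32 accumulation (illustrative only). */
    static void gemm_row_f16_acc32(const float *a_row, const float *b, float *c_row,
                                   int cols_a, int cols_b, float alpha)
    {
        for(int n = 0; n < cols_b; ++n)
        {
            float acc = 0.0f; /* 32-bit accumulator, as in the float8 c00..c30 above */
            for(int k = 0; k < cols_a; ++k)
            {
                acc += a_row[k] * b[k * cols_b + n]; /* multiply-accumulate step of the kernel */
            }
            c_row[n] = acc * alpha; /* the kernel converts back to half at this point */
        }
    }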
+
 /** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
@@ -2291,6 +2476,354 @@
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 /** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                       ,
+                                                       uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+    // Compute starting address for matrix A and Matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    float8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    float8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    float8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    float8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    int i = 0;
+    for(; i <= ((int)COLS_A - 4); i += 4)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 4 * sizeof(half);
+    }
+
+    for(; i < (int)COLS_A; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+        src_addr += (int2)(sizeof(half), src1_stride_y);
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
+#endif                                    // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    }
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    half8 hacc0 = convert_half8(acc0) * (half8)ALPHA;
+#else  //defined(ALPHA)
+    half8 hacc0 = convert_half8(acc0);
+#endif // defined(ALPHA)
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(ALPHA)
+    half8 hacc1 = convert_half8(acc1) * (half8)ALPHA;
+#else  //defined(ALPHA)
+    half8 hacc1 = convert_half8(acc1);
+#endif //defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(ALPHA)
+    half8 hacc2 = convert_half8(acc2) * (half8)ALPHA;
+#else  //defined(ALPHA)
+    half8 hacc2 = convert_half8(acc2);
+#endif //defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(ALPHA)
+    half8 hacc3 = convert_half8(acc3) * (half8)ALPHA;
+#else  //defined(ALPHA)
+    half8 hacc3 = convert_half8(acc3);
+#endif // defined(ALPHA)
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+    int z = get_global_id(2);
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+    // Store the output block
+    vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+    // Store the output block
+    vstore8(hacc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore8(hacc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore8(hacc2, 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore8(hacc3, 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // REINTERPRET_OUTPUT_AS_3D
+}
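The zin/zout arithmetic used when REINTERPRET_INPUT_AS_3D or REINTERPRET_OUTPUT_AS_3D is defined adds, per tile row, the bytes occupied by the cross-plane padding of every plane boundary that row has crossed; a hedged scalar C sketch of that offset follows (all parameters are hypothetical stand-ins for the compile-time defines and kernel arguments).

    /* Per-row byte offset for a 2D tile stored in a 3D tensor with cross-plane padding (illustrative only). */
    static unsigned int cross_plane_offset(unsigned int row_in_tile, unsigned int tile_start_row,
                                           unsigned int height_gemm3d, unsigned int depth_gemm3d,
                                           unsigned int cross_plane_pad, unsigned int stride_y)
    {
        unsigned int z = (tile_start_row + row_in_tile) / height_gemm3d; /* plane this row lands in */
        if(z > depth_gemm3d - 1)
        {
            z = depth_gemm3d - 1; /* min(DEPTH_GEMM3D - 1, z) as in the kernel */
        }
        return z * cross_plane_pad * stride_y; /* skip the pad rows of every crossed plane boundary */
    }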
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
  * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
  * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
  * This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index cd8b269..8c1fa54 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -24,13 +24,13 @@
 #include "helpers.h"
 #include "helpers_asymm.h"
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
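Whichever extension path is taken, ARM_DOT(x, y, val) performs an 8-bit dot product of two uchar4 vectors accumulated into a 32-bit value; a hedged scalar C equivalent follows.

    #include <stdint.h>

    /* Scalar reference for ARM_DOT(x, y, val) (illustrative only). */
    static void arm_dot_ref(const uint8_t x[4], const uint8_t y[4], uint32_t *val)
    {
        for(int i = 0; i < 4; ++i)
        {
            *val += (uint32_t)x[i] * (uint32_t)y[i];
        }
    }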
 
 #if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -40,6 +40,12 @@
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -58,13 +64,26 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
-                                                         IMAGE_DECLARATION(dst))
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+                                                         uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                         ,
+                                                         uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                        )
 {
-    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
-    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+    const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    const int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -72,9 +91,16 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
     __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr_b += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     // Compute end row address for matrix B
     __global uchar *src_end_addr_b = src_addr_b + COLS_B;
 
@@ -122,11 +148,49 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
     // Store 4x4 block
-    vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
-    vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
-    vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
-    vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
+    vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+    vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+    vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+    vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += z * dst_stride_z;
+
+    // Store 4x4 block
+    vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+    vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+    vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+    vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
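The batched addressing of matrix B introduced above follows a single rule: B slides with the batch index unless MATRIX_B_DEPTH is defined, in which case the index wraps around the available B planes; a hedged scalar C sketch follows (pass matrix_b_depth = 0 to model the undefined case).

    #include <stddef.h>

    /* Byte offset added to matrix B for batch z (illustrative only). */
    static size_t matrix_b_batch_offset(size_t z, size_t src1_stride_z, size_t matrix_b_depth)
    {
        if(matrix_b_depth != 0) /* mirrors the #if defined(MATRIX_B_DEPTH) branch */
        {
            return (z % matrix_b_depth) * src1_stride_z; /* do not slide B past its own planes */
        }
        return z * src1_stride_z; /* B slides together with A */
    }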
 
 /** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
@@ -136,6 +200,12 @@
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -154,13 +224,26 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
-                                                         IMAGE_DECLARATION(dst))
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+                                                         uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                         ,
+                                                         uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                        )
 {
-    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
-    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
+    const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    const int z = get_global_id(2);
 
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
@@ -168,9 +251,16 @@
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);
     __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
 
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr_b += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     // Compute end row address for matrix B
     __global uchar *src_end_addr_b = src_addr_b + COLS_B;
 
@@ -416,14 +506,52 @@
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
     // Store 4x4 block
-    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
-    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
-    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
-    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
+    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += z * dst_stride_z;
+
+    // Store 4x4 block
+    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
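For the reshaped QASYMM8 kernels in this file, the starting byte offsets inside the interleaved matrix A and the transposed matrix B are derived from the global ids and the two reshape factors; a hedged scalar C sketch follows (byte and element offsets coincide here because QASYMM8 elements are one byte, and all parameters are hypothetical stand-ins for the kernel arguments).

    #include <stddef.h>

    /* Start offsets into reshaped A (interleaved 4x4) and reshaped B (transposed 1xW), illustrative only. */
    static void reshaped_start_offsets(size_t gid0, size_t gid1, size_t gid2,
                                       size_t mult_interleave4x4_height, size_t transpose1xw_width_step,
                                       size_t src0_stride_y, size_t src0_stride_z, size_t src1_stride_y,
                                       size_t *offset_a, size_t *offset_b)
    {
        size_t offset_row_a = (gid1 % mult_interleave4x4_height) * 4; /* row inside the 4x4 interleaved block */
        size_t offset_row_b = (gid0 % transpose1xw_width_step) * 4;   /* group inside the transposed 1xW block */

        *offset_a = gid2 * src0_stride_z + (gid1 / mult_interleave4x4_height) * src0_stride_y + offset_row_a;
        *offset_b = (gid0 / transpose1xw_width_step) * src1_stride_y + offset_row_b;
    }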
 
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 /** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
  *
@@ -431,6 +559,12 @@
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
  * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
  *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -449,25 +583,38 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),
                                                               IMAGE_DECLARATION(src1),
-                                                              IMAGE_DECLARATION(dst))
+                                                              IMAGE_DECLARATION(dst),
+                                                              uint src0_stride_z,
+                                                              uint src1_stride_z,
+                                                              uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                              ,
+                                                              uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                             )
 {
-    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
-    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
-
     // Offset
     const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
     const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
 
     // src_addr_a = address of matrix A
     // src_addr_b = address of matrix B
-    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
-    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
+    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + (get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT) * src0_stride_y + get_global_id(2) * src0_stride_z + src0_offset_first_element_in_bytes);
+    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + (get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP) * src1_stride_y + src1_offset_first_element_in_bytes);
 
-    // Compute end row address for matrix B
-    __global uchar *src_end_addr_b = src_addr_b + COLS_B;
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+    src_addr_b += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr_b += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     src_addr_a += offset_row_a;
     src_addr_b += offset_row_b;
@@ -477,21 +624,27 @@
     uint c01 = 0;
     uint c02 = 0;
     uint c03 = 0;
+
     uint c10 = 0;
     uint c11 = 0;
     uint c12 = 0;
     uint c13 = 0;
+
     uint c20 = 0;
     uint c21 = 0;
     uint c22 = 0;
     uint c23 = 0;
+
     uint c30 = 0;
     uint c31 = 0;
     uint c32 = 0;
     uint c33 = 0;
 
+#define COLS_MTX_B (COLS_B / (16 * MULT_TRANSPOSE1XW_WIDTH))
+
 #if MULT_INTERLEAVE4X4_HEIGHT == 1
-    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
+    int i = 0;
+    for(; i <= (int)(COLS_MTX_B - 8); i += 8)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         uchar16 a0 = vload16(0, src_addr_a);
@@ -499,95 +652,138 @@
         uchar4  b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
         uchar4  b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
         uchar4  b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);
+        uchar4  b4 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
+        uchar4  b5 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
+        uchar4  b6 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
+        uchar4  b7 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
 
         // Accumulate
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c00);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c01);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c02);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c03);
 
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c10);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c11);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c12);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c13);
 
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c20);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c21);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c22);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c23);
 
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c30);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c31);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c32);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c33);
 
-        // Load values from matrix A (interleaved) and matrix B (transposed)
+        // Accumulate
         a0 = vload16(0, src_addr_a + 16);
-        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
-        b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
-        b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
-        b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);
 
-        // Accumulate
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
-        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c00);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c01);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c02);
+        ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c03);
 
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
-        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c10);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c11);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c12);
+        ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c13);
 
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
-        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c20);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c21);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c22);
+        ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c23);
 
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
-        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c30);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c31);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c32);
+        ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c33);
+
+        src_addr_a += 32;
+        src_addr_b += 32 * TRANSPOSE1XW_WIDTH_STEP;
     }
 #endif // MULT_INTERLEAVE4X4_HEIGHT == 1
-
-    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
+    int i_left_over = 0;
+    for(; i < (int)(COLS_MTX_B); ++i)
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
-        uchar4 a0 = vload4(0, src_addr_a);
-        uchar4 b0 = vload4(0, src_addr_b);
+        uchar16 a0 = vload16(0, src_addr_a + (i_left_over % 4) + ((i_left_over / 4) * 16));
+        uchar4  b0 = vload4(0, src_addr_b);
 
-        c00 += (ushort)a0.s0 * b0.s0;
-        c01 += (ushort)a0.s0 * b0.s1;
-        c02 += (ushort)a0.s0 * b0.s2;
-        c03 += (ushort)a0.s0 * b0.s3;
+        c00 += a0.s0 * b0.s0;
+        c01 += a0.s0 * b0.s1;
+        c02 += a0.s0 * b0.s2;
+        c03 += a0.s0 * b0.s3;
 
-        c10 += (ushort)a0.s1 * b0.s0;
-        c11 += (ushort)a0.s1 * b0.s1;
-        c12 += (ushort)a0.s1 * b0.s2;
-        c13 += (ushort)a0.s1 * b0.s3;
+        c10 += a0.s4 * b0.s0;
+        c11 += a0.s4 * b0.s1;
+        c12 += a0.s4 * b0.s2;
+        c13 += a0.s4 * b0.s3;
 
-        c20 += (ushort)a0.s2 * b0.s0;
-        c21 += (ushort)a0.s2 * b0.s1;
-        c22 += (ushort)a0.s2 * b0.s2;
-        c23 += (ushort)a0.s2 * b0.s3;
+        c20 += a0.s8 * b0.s0;
+        c21 += a0.s8 * b0.s1;
+        c22 += a0.s8 * b0.s2;
+        c23 += a0.s8 * b0.s3;
 
-        c30 += (ushort)a0.s3 * b0.s0;
-        c31 += (ushort)a0.s3 * b0.s1;
-        c32 += (ushort)a0.s3 * b0.s2;
-        c33 += (ushort)a0.s3 * b0.s3;
+        c30 += a0.sC * b0.s0;
+        c31 += a0.sC * b0.s1;
+        c32 += a0.sC * b0.s2;
+        c33 += a0.sC * b0.s3;
+
+        i_left_over++;
+        src_addr_b += 4 * TRANSPOSE1XW_WIDTH_STEP;
     }
 
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
     // Store 4x4 block
-    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
-    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
-    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
-    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
+    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
+    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
+    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
+    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
+
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += get_global_id(2) * dst_stride_z;
+
+    // Store 4x4 block
+    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
 #endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
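
As a rough illustration of the cross-plane-padding arithmetic used above when REINTERPRET_OUTPUT_AS_3D is defined (the same arithmetic drives the zin offsets when REINTERPRET_INPUT_AS_3D is defined), the following host-side C sketch reproduces the extra byte offset added to each output row on top of its flat 2D address. HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad and dst_stride_y stand in for the kernel's compile-time defines and arguments; the values used in main() are purely illustrative.

#include <stdio.h>

/* Extra bytes added to output row m when the 2D tile is stored into a 3D tensor:
 * the plane of row m is m / HEIGHT_GEMM3D (clamped to DEPTH_GEMM3D - 1, matching the
 * min() in the kernels), and every plane boundary crossed so far contributes
 * cross_plane_pad padding rows of dst_stride_y bytes each. */
static unsigned int zout_bytes(unsigned int m, unsigned int height_gemm3d,
                               unsigned int depth_gemm3d, unsigned int cross_plane_pad,
                               unsigned int dst_stride_y)
{
    unsigned int plane = m / height_gemm3d;
    if(plane > depth_gemm3d - 1u)
    {
        plane = depth_gemm3d - 1u;
    }
    return plane * cross_plane_pad * dst_stride_y;
}

int main(void)
{
    /* Illustrative values: 4 rows per plane, 3 planes, 1 padding row between planes, 64-byte rows.
     * Rows 0-3 get 0 extra bytes, rows 4-7 get 64, rows 8-11 get 128. */
    for(unsigned int m = 0u; m < 12u; ++m)
    {
        printf("row %2u -> extra %u bytes\n", m, zout_bytes(m, 4u, 3u, 1u, 64u));
    }
    /* The batch offset is handled separately: each batch spans DEPTH_GEMM3D planes, which is
     * why the kernels add z * dst_stride_z * DEPTH_GEMM3D in the REINTERPRET_OUTPUT_AS_3D path. */
    return 0;
}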
 
@@ -599,6 +795,13 @@
  *
  * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
  *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -617,10 +820,27 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom padding in units of elements for the input tensor (only if REINTERPRET_INPUT_AS_3D is defined)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom padding in units of elements for the output tensor (only if REINTERPRET_OUTPUT_AS_3D is defined)
  */
 __kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                   IMAGE_DECLARATION(src1),
-                                  IMAGE_DECLARATION(dst))
+                                  IMAGE_DECLARATION(dst),
+                                  uint src0_stride_z,
+                                  uint src1_stride_z,
+                                  uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                  ,
+                                  uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                  ,
+                                  uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                 )
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -633,6 +853,47 @@
     // Update address for the matrix B
     src_addr.s1 += idx;
 
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     int end_row_vec_a = src_addr.s0 + COLS_A;
 
     VECTOR_UINT acc0 = 0;
@@ -725,34 +986,95 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
     }
 
+    const int z = get_global_id(2);
+
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
     // Store the result
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
+    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
+    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
+    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
+    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
     VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
-    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
+    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += z * dst_stride_z;
+
+    // Store the result
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
+    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
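
The @note blocks above only list the names of the compile-time defines; as a sketch of how they fit together, a build-options string for this kernel could look as follows. All numeric values are placeholders and the library's own build-option plumbing is not shown.

/* Hypothetical build options for gemmlowp_mm_midgard; HEIGHT_GEMM3D * DEPTH_GEMM3D
 * (7 * 7 = 49) is chosen to satisfy the relation stated in the note above. */
static const char *gemmlowp_mm_midgard_build_opts =
    "-DCOLS_A=256 "
    "-DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 "
    "-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4 "
    "-DREINTERPRET_OUTPUT_AS_3D "
    "-DHEIGHT_GEMM3D=7 "
    "-DDEPTH_GEMM3D=7";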
 
 /** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
  * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
  *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -771,10 +1093,27 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom padding in units of elements for the input tensor (only if REINTERPRET_INPUT_AS_3D is defined)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom padding in units of elements for the output tensor (only if REINTERPRET_OUTPUT_AS_3D is defined)
  */
 __kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                   IMAGE_DECLARATION(src1),
-                                  IMAGE_DECLARATION(dst))
+                                  IMAGE_DECLARATION(dst),
+                                  uint src0_stride_z,
+                                  uint src1_stride_z,
+                                  uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                  ,
+                                  uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                  ,
+                                  uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                 )
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -787,6 +1126,47 @@
     // Update address for the matrix B
     src_addr.s1 += idx;
 
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
     int end_row_vec_a = src_addr.s0 + COLS_A;
 
     uint acc00 = 0;
@@ -1075,30 +1455,86 @@
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
     }
 
+    const int z = get_global_id(2);
+
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
+
     // Store the result
-    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
+    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
+    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
+    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
+    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
+    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst.ptr += z * dst_stride_z;
+
+    // Store the result
+    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
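
The #if defined(MATRIX_B_DEPTH) blocks in the kernels above decide whether matrix B slides along the batch dimension together with matrix A. A minimal sketch of that plane selection, assuming MATRIX_B_DEPTH is the number of Z planes actually present in matrix B:

/* Which Z plane of matrix B the work-item with batch index z reads. When MATRIX_B_DEPTH
 * is defined (B has fewer planes than there are batches), the batch index wraps around
 * instead of sliding past the end of B; otherwise B slides together with A. */
static unsigned int matrix_b_plane(unsigned int z, int matrix_b_depth_is_defined,
                                   unsigned int matrix_b_depth)
{
    if(matrix_b_depth_is_defined)
    {
        return z % matrix_b_depth;
    }
    return z;
}
/* The byte offset added to the matrix B address is then matrix_b_plane(...) * src1_stride_z. */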
 
-#if ARM_COMPUTE_OPENCL_DOT8_ENABLED
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 /** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
  *
  * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
  *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of matrix A NOT reshaped
+ *
  * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
  * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1117,10 +1553,27 @@
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom padding in units of elements for the input tensor (only if REINTERPRET_INPUT_AS_3D is defined)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom padding in units of elements for the output tensor (only if REINTERPRET_OUTPUT_AS_3D is defined)
  */
 __kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
                                        IMAGE_DECLARATION(src1),
-                                       IMAGE_DECLARATION(dst))
+                                       IMAGE_DECLARATION(dst),
+                                       uint src0_stride_z,
+                                       uint src1_stride_z,
+                                       uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                       ,
+                                       uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                       ,
+                                       uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                      )
 {
     int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
 
@@ -1133,214 +1586,361 @@
     // Update address for the matrix B
     src_addr.s1 += idx;
 
-    int end_row_vec_a = src_addr.s0 + COLS_A;
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zin       = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
 
     uint acc00 = 0;
     uint acc01 = 0;
     uint acc02 = 0;
     uint acc03 = 0;
+    uint acc04 = 0;
+    uint acc05 = 0;
+    uint acc06 = 0;
+    uint acc07 = 0;
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
     uint acc10 = 0;
     uint acc11 = 0;
     uint acc12 = 0;
     uint acc13 = 0;
+    uint acc14 = 0;
+    uint acc15 = 0;
+    uint acc16 = 0;
+    uint acc17 = 0;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
     uint acc20 = 0;
     uint acc21 = 0;
     uint acc22 = 0;
     uint acc23 = 0;
+    uint acc24 = 0;
+    uint acc25 = 0;
+    uint acc26 = 0;
+    uint acc27 = 0;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
     uint acc30 = 0;
     uint acc31 = 0;
     uint acc32 = 0;
     uint acc33 = 0;
+    uint acc34 = 0;
+    uint acc35 = 0;
+    uint acc36 = 0;
+    uint acc37 = 0;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    uint acc40 = 0;
-    uint acc41 = 0;
-    uint acc42 = 0;
-    uint acc43 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
 
-    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
+    // A and B src indices get incremented at the same time.
+    int i = 0;
+    for(; i <= ((int)COLS_A - 8); i += 8)
     {
-        // Load values from matrix A
-        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A and matrix B
+        uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+        uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+        uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+        uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-        // Load values from matrix B
-        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
-        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
-        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
-        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A and matrix B
+        uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
 
-        {
-            // Accumulate
-            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a0.s0, a0.s1, a0.s2, a0.s3, acc00);
-            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a0.s0, a0.s1, a0.s2, a0.s3, acc01);
-            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a0.s0, a0.s1, a0.s2, a0.s3, acc02);
-            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a0.s0, a0.s1, a0.s2, a0.s3, acc03);
-        }
+        uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
+        uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
+        uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
+        uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+        src_addr.s1 += 4 * src1_stride_y;
+
+        ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
+        ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
+
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        {
-            // Accumulate
-            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a1.s0, a1.s1, a1.s2, a1.s3, acc10);
-            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a1.s0, a1.s1, a1.s2, a1.s3, acc11);
-            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a1.s0, a1.s1, a1.s2, a1.s3, acc12);
-            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a1.s0, a1.s1, a1.s2, a1.s3, acc13);
-        }
+        ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
+        ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        {
-            // Accumulate
-            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a2.s0, a2.s1, a2.s2, a2.s3, acc20);
-            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a2.s0, a2.s1, a2.s2, a2.s3, acc21);
-            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a2.s0, a2.s1, a2.s2, a2.s3, acc22);
-            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a2.s0, a2.s1, a2.s2, a2.s3, acc23);
-        }
+        ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
+        ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        {
-            // Accumulate
-            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a3.s0, a3.s1, a3.s2, a3.s3, acc30);
-            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a3.s0, a3.s1, a3.s2, a3.s3, acc31);
-            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a3.s0, a3.s1, a3.s2, a3.s3, acc32);
-            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a3.s0, a3.s1, a3.s2, a3.s3, acc33);
-        }
+        ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
+        ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-        {
-            // Accumulate
-            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a4.s0, a4.s1, a4.s2, a4.s3, acc40);
-            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a4.s0, a4.s1, a4.s2, a4.s3, acc41);
-            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a4.s0, a4.s1, a4.s2, a4.s3, acc42);
-            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a4.s0, a4.s1, a4.s2, a4.s3, acc43);
-        }
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+
+        b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
+        b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
+        b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
+        b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);
+        src_addr.s1 += 4 * src1_stride_y;
+
+        ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);
+        ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);
+        ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);
+        ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);
+        ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+        src_addr.s0 += 8;
     }
 
-    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
+    for(; i < (int)COLS_A; ++i)
     {
+#if defined(REINTERPRET_INPUT_AS_3D)
         // Load values from matrix A
-        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
+        uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
+        uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
+        uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
+        uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+        uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+        uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+        uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
         // Load values from matrix B
-        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);
+        uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);
+        src_addr.s1 += src1_stride_y;
 
-        // Accumulate
-        {
-            // Accumulate
-            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
-            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
-            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
-            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;
+        acc00 += (uint)a0 * b0.s0;
+        acc01 += (uint)a0 * b0.s1;
+        acc02 += (uint)a0 * b0.s2;
+        acc03 += (uint)a0 * b0.s3;
+        acc04 += (uint)a0 * b0.s4;
+        acc05 += (uint)a0 * b0.s5;
+        acc06 += (uint)a0 * b0.s6;
+        acc07 += (uint)a0 * b0.s7;
 
-            acc00 += ((uint)tmp0);
-            acc01 += ((uint)tmp1);
-            acc02 += ((uint)tmp2);
-            acc03 += ((uint)tmp3);
-        }
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        {
-            // Accumulate
-            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
-            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
-            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
-            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;
-
-            acc10 += ((uint)tmp0);
-            acc11 += ((uint)tmp1);
-            acc12 += ((uint)tmp2);
-            acc13 += ((uint)tmp3);
-        }
+        acc10 += (uint)a1 * b0.s0;
+        acc11 += (uint)a1 * b0.s1;
+        acc12 += (uint)a1 * b0.s2;
+        acc13 += (uint)a1 * b0.s3;
+        acc14 += (uint)a1 * b0.s4;
+        acc15 += (uint)a1 * b0.s5;
+        acc16 += (uint)a1 * b0.s6;
+        acc17 += (uint)a1 * b0.s7;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        {
-            // Accumulate
-            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
-            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
-            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
-            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;
-
-            acc20 += ((uint)tmp0);
-            acc21 += ((uint)tmp1);
-            acc22 += ((uint)tmp2);
-            acc23 += ((uint)tmp3);
-        }
+        acc20 += (uint)a2 * b0.s0;
+        acc21 += (uint)a2 * b0.s1;
+        acc22 += (uint)a2 * b0.s2;
+        acc23 += (uint)a2 * b0.s3;
+        acc24 += (uint)a2 * b0.s4;
+        acc25 += (uint)a2 * b0.s5;
+        acc26 += (uint)a2 * b0.s6;
+        acc27 += (uint)a2 * b0.s7;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        {
-            // Accumulate
-            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
-            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
-            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
-            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;
-
-            acc30 += ((uint)tmp0);
-            acc31 += ((uint)tmp1);
-            acc32 += ((uint)tmp2);
-            acc33 += ((uint)tmp3);
-        }
+        acc30 += (uint)a3 * b0.s0;
+        acc31 += (uint)a3 * b0.s1;
+        acc32 += (uint)a3 * b0.s2;
+        acc33 += (uint)a3 * b0.s3;
+        acc34 += (uint)a3 * b0.s4;
+        acc35 += (uint)a3 * b0.s5;
+        acc36 += (uint)a3 * b0.s6;
+        acc37 += (uint)a3 * b0.s7;
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-        {
-            // Accumulate
-            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
-            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
-            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
-            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;
 
-            acc40 += ((uint)tmp0);
-            acc41 += ((uint)tmp1);
-            acc42 += ((uint)tmp2);
-            acc43 += ((uint)tmp3);
-        }
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
+        src_addr.s0 += 1;
     }
 
+    int z = get_global_id(2);
+
     // Compute destination address
     Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
 
+    // Compute dst address
+    __global uchar *dst_addr = dst.ptr;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
+    zout       = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
     // Store the result
-    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
+    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
+    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
+    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
 #if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
+    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
+    vstore4((int4)(acc34, acc35, acc36, acc37), 1, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));
 #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-}
-#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED
 
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+    // Store the result
+    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));
+    vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));
+    vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));
+    vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));
+    vstore4((int4)(acc34, acc35, acc36, acc37), 1, (__global int *)(dst_addr + 3 * dst_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
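
A minimal host-side C++ sketch of the REINTERPRET_OUTPUT_AS_3D addressing used above: it reproduces only the plane index (zout) and cross-plane-padding offset per output row. HEIGHT_GEMM3D, DEPTH_GEMM3D, the cross-plane padding, the stride and the work-item ids are placeholder values chosen for illustration, not values used by the library.

```cpp
#include <algorithm>
#include <cstdio>

// Scalar sketch: a 2D tile of rows is stored into a 3D tensor whose planes
// may be separated by a cross-plane padding (expressed in rows here).
int main()
{
    const int height_gemm3d   = 4;  // rows per plane (HEIGHT_GEMM3D, placeholder)
    const int depth_gemm3d    = 3;  // number of planes (DEPTH_GEMM3D, placeholder)
    const int cross_plane_pad = 2;  // padded rows between planes (placeholder)
    const int dst_stride_y    = 64; // bytes per row (placeholder)

    const int rows_per_thread = 4;  // NUM_ELEMS_PROCESSED_PER_THREAD_Y (placeholder)
    const int gid_y           = 1;  // get_global_id(1) (placeholder)

    for(int r = 0; r < rows_per_thread; ++r)
    {
        // Global row index written by this work-item
        const int row = gid_y * rows_per_thread + r;
        // Plane the row falls into, clamped to the last plane
        const int zout = std::min(row / height_gemm3d, depth_gemm3d - 1);
        // Extra byte offset accounting for the padding of every crossed plane
        const int pad_offset = zout * cross_plane_pad * dst_stride_y;
        std::printf("row %d -> plane %d, padding offset %d bytes\n", row, zout, pad_offset);
    }
    return 0;
}
```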
 
 #if defined(COLS_A)
@@ -1398,6 +1998,70 @@
 
     *((__global int *)dst.ptr) = (int)sum_row;
 }
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
+                                               IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    uint sum_row = 0;
+
+    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 32 accumulations per iteration
+    for(; i <= ((int)COLS_A - 32); i += 32)
+    {
+        uchar16 a0_u8 = vload16(0, matrix_a + i);
+
+        sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
+
+        a0_u8 = vload16(1, matrix_a + i);
+
+        sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));
+        sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += matrix_a[i];
+    }
+
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #endif // defined(COLS_A)
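
For reference, the row reduction computed by gemmlowp_matrix_a_reduction_dot8 is equivalent to the scalar C++ loop below (each arm_dot call simply accumulates four uint8 entries at a time); COLS_A and the row contents are placeholder values used only for illustration.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar reference of the matrix A row reduction: sum all the uint8 entries
// of one row into a 32-bit accumulator, as the S32 destination tensor expects.
int main()
{
    const int            cols_a = 37;    // COLS_A (placeholder)
    std::vector<uint8_t> row(cols_a, 3); // one row of matrix A (placeholder data)

    uint32_t sum_row = 0;
    for(int i = 0; i < cols_a; ++i)
    {
        sum_row += row[i]; // arm_dot(a0_u8.sXXXX, (uchar4)(1)) accumulates four of these at once
    }
    std::printf("sum_row = %d\n", static_cast<int32_t>(sum_row));
    return 0;
}
```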
 
 #if defined(COLS_B) && defined(ROWS_B)
@@ -1463,6 +2127,101 @@
 #endif // defined(COLS_B) && defined(ROWS_B)
 
 #if defined(K_OFFSET)
+
+/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.
+ *
+ * This function takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * @param[in] x                                     get_global_id(0) * 4
+ * @param[in] y                                     get_global_id(1)
+ * @param[in] z                                     get_global_id(2)
+ * @param[in] sum_col_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                        (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                        (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                        (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                        (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases tensor
+ */
+inline int4 offset_contribution(
+    int x,
+    int y,
+    int z
+#if defined(A_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+    int4 a_offset_s32 = (int4)0;
+    int4 b_offset_s32 = (int4)0;
+
+    int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+    batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+    // Compute the sum_col address
+    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+    // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else  // defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+    a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+    // Compute the sum_row address
+    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+    // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else  // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
+
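
Per output element, the value returned by offset_contribution() is the gemmlowp offset-correction term documented above. A scalar C++ sketch of the same formula follows; the offsets, sums and bias are made-up values used only for illustration.

```cpp
#include <cstdint>
#include <cstdio>

// Scalar sketch of the gemmlowp offset contribution:
//   mm_result[i][k] += sum_col[k] * A_OFFSET + sum_row[i] * B_OFFSET + K_OFFSET (+ bias)
// where K_OFFSET = A_OFFSET * B_OFFSET * K and K is the number of matrix A columns.
int main()
{
    const int32_t a_offset = 1;                       // -DA_OFFSET (placeholder)
    const int32_t b_offset = 6;                       // -DB_OFFSET (placeholder)
    const int32_t k        = 200;                     // matrix A columns (placeholder)
    const int32_t k_offset = a_offset * b_offset * k; // -DK_OFFSET

    int32_t       mm_result = -1234;                  // raw int32 accumulator (placeholder)
    const int32_t sum_col   = 517;                    // column sum of matrix B (placeholder)
    const int32_t sum_row   = 264;                    // row sum of matrix A (placeholder)
    const int32_t bias      = 42;                     // optional bias, -DADD_BIAS (placeholder)

    mm_result += sum_col * a_offset + sum_row * b_offset + k_offset + bias;
    std::printf("corrected accumulator = %d\n", mm_result);
    return 0;
}
```
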
 /* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
  *
  * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),
@@ -1480,26 +2239,30 @@
  *                   (sum_row[i] * B_OFFSET) +
  *                   (K_OFFSET)
  *
- * @param[in] mm_result_ptr                                Pointer to the source tensor. Supported data type: S32
- * @param[in] mm_result_stride_x                           Stride of the source tensor in X dimension (in bytes)
- * @param[in] mm_result_step_x                             mm_result_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mm_result_stride_y                           Stride of the source tensor in Y dimension (in bytes)
- * @param[in] mm_result_step_y                             mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] mm_result_stride_z                           Stride of the source tensor in Z dimension (in bytes)
- * @param[in] mm_result_step_z                             mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mm_result_offset_first_element_in_bytes      The offset of the first element in the source tensor
- * @param[in] sum_col_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_col_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_col_result_step_x                        sum_col_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_col_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_col_result_step_y                        sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_row_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
- * @param[in] sum_row_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in] sum_row_result_step_x                        sum_row_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_row_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in] sum_row_result_step_y                        sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                          (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                          (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                          (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                          (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                              (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                         (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                           (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes    (Optional) The offset of the first element in the biases tensor
  */
 __kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
 #if defined(A_OFFSET)
@@ -1510,44 +2273,348 @@
                                            ,
                                            IMAGE_DECLARATION(sum_row)
 #endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                           ,
+                                           VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
                                           )
 {
-    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
 
-    int4 a_offset_s32 = (int4)0;
-    int4 b_offset_s32 = (int4)0;
-
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
 #if defined(A_OFFSET)
-    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);
-
-    // Compute the offset contribution due to A_OFFSET
-#if defined(SUM_COL_HAS_BATCHES)
-    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
-#else  // defined(MATRIX_B_HAS_BATCHES)
-    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
-#endif // defined(MATRIX_B_HAS_BATCHES)
-
-    a_offset_s32 *= (int4)A_OFFSET;
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
 #endif // defined(A_OFFSET)
-
 #if defined(B_OFFSET)
-    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);
-
-    // Compute the offset contribution due to B_OFFSET
-    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
-    b_offset_s32 *= (int4)B_OFFSET;
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
 #endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
 
-    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
 
-    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
 
     // Add the offset terms to GEMM's result
     in_s32 += offset_term_s32;
 
     // Store the result with the offset contribution
-    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
+    vstore4(in_s32, 0, (__global int *)mm_result_addr);
 }
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and to quantize the result down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by RESULT_MULTIPLIER
+ *  -# Shift the int32 accumulator by RESULT_SHIFT
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                          (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                          (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                          (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                          (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                              (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                         (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                           (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes    (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                 Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                            Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                              dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                            Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                              dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                            Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                              dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes       The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                         ,
+#if defined(ADD_BIAS)
+                                                         VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                         TENSOR3D_DECLARATION(dst))
+{
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
+#if defined(A_OFFSET)
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (int4)RESULT_OFFSET;
+
+    // Multiply by RESULT_MULTIPLIER and shift right by RESULT_SHIFT
+    in_s32 *= RESULT_MULTIPLIER;
+
+    in_s32 >>= RESULT_SHIFT;
+
+    uchar4 res = convert_uchar4_sat(in_s32);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, dst_addr);
+}
+
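
Per element, the integer output stage applied by gemmlowp_offset_contribution_quantize_down reduces to the scalar arithmetic sketched below; RESULT_OFFSET, RESULT_MULTIPLIER, RESULT_SHIFT and the accumulator are placeholder values.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar sketch of the integer quantize-down output stage:
// add the offset, multiply, shift right, then saturate to [0..255].
int main()
{
    const int32_t result_offset     = 3; // -DRESULT_OFFSET (placeholder)
    const int32_t result_multiplier = 5; // -DRESULT_MULTIPLIER (placeholder)
    const int32_t result_shift      = 4; // -DRESULT_SHIFT (placeholder)

    int32_t acc = 997; // int32 accumulator after the offset contribution (placeholder)

    acc += result_offset;
    acc *= result_multiplier;
    acc >>= result_shift;

    // Saturating cast to QASYMM8 (convert_uchar4_sat in the kernel)
    const uint8_t q = static_cast<uint8_t>(std::min(255, std::max(0, acc)));
    std::printf("quantized value = %u\n", q);
    return 0;
}
```
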
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and to quantize the result down to uint8 with a fixed-point output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                          (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                          (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                          (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                          (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                              (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                         (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                           (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes    (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                 Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                            Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                              dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                            Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                              dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                            Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                              dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes       The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                                    ,
+#if defined(ADD_BIAS)
+                                                                    VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                    TENSOR3D_DECLARATION(dst))
+{
+    const int x = get_global_id(0) * 4;
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    int4 offset_term_s32 = offset_contribution(
+                               x, y, z
+#if defined(A_OFFSET)
+                               ,
+                               sum_col_ptr,
+                               sum_col_stride_x,
+                               sum_col_step_x,
+                               sum_col_stride_y,
+                               sum_col_step_y,
+                               sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                               ,
+                               sum_row_ptr,
+                               sum_row_stride_x,
+                               sum_row_step_x,
+                               sum_row_stride_y,
+                               sum_row_step_y,
+                               sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                               ,
+                               biases_ptr,
+                               biases_stride_x,
+                               biases_step_x,
+                               biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                           );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Apply the fixed-point multiplier (RESULT_MULTIPLIER) and rounding shift (RESULT_SHIFT)
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (int4)RESULT_OFFSET;
+
+    uchar4 res = convert_uchar4_sat(in_s32);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, dst_addr);
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
 #endif // defined(K_OFFSET)
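
The ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE macro used by the fixed-point variants above follows the gemmlowp requantization scheme: a saturating rounding doubling high multiply followed by a rounding right shift. The C++ sketch below mirrors that scheme under this assumption; the multiplier, shift, offset and accumulator are placeholder values.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Saturating rounding doubling high multiply: round(a * b / 2^31), saturated at INT32_MAX.
static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; // the only overflowing case
    }
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) / (1ll << 31));
}

// Round-to-nearest division by a power of two.
static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

int main()
{
    const int32_t multiplier = 1395864371; // RESULT_MULTIPLIER (placeholder, < 2^31)
    const int     shift      = 8;          // RESULT_SHIFT (placeholder)
    const int32_t offset     = 10;         // RESULT_OFFSET (placeholder)

    int32_t acc = 23042; // int32 accumulator after the offset contribution (placeholder)

    acc = rounding_divide_by_pow2(sat_rounding_doubling_high_mul(acc, multiplier), shift);
    acc += offset;

    const uint8_t q = static_cast<uint8_t>(std::min(255, std::max(0, acc)));
    std::printf("quantized value = %u\n", q);
    return 0;
}
```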
 
 #if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
@@ -1577,10 +2644,10 @@
  * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
- * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
- * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
  * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
  * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
@@ -1597,39 +2664,43 @@
                                                   TENSOR3D_DECLARATION(dst))
 {
     // Compute source and destination addresses
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#if defined(ADD_BIAS)
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-#endif // defined(ADD_BIAS)
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
 
-    int16 input_values = vload16(0, (__global int *)src.ptr);
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
 
-    // Add the offset terms to GEMM's result
-    input_values += (int16)RESULT_OFFSET;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
-    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
-    input_values += (int16)biases_values;
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
 #endif // defined(ADD_BIAS)
 
+    // Add the offset terms to GEMM's result
+    input_values += (int4)RESULT_OFFSET;
+
     // Multiply by result_mult_int and shift
     input_values *= RESULT_MULT_INT;
 
     input_values >>= RESULT_SHIFT;
 
-    uchar16 res = convert_uchar16_sat(input_values);
+    uchar4 res = convert_uchar4_sat(input_values);
 
 #if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
+    res = max(res, (uchar4)MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
+    res = min(res, (uchar4)MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    vstore16(res, 0, dst.ptr);
+    vstore4(res, 0, dst_addr);
 }
 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
 
@@ -1646,7 +2717,92 @@
  *  -# Clamp the value between the specified min and max bounds
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Apply the fixed-point multiplier (RESULT_FIXEDPOINT_MULTIPLIER) and rounding shift (RESULT_SHIFT)
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+
+    // Add the offset terms to GEMM's result
+    input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+    uchar4 res = convert_uchar4_sat(input_values);
+
+#if defined(MIN_BOUND)
+    res = max(res, (uchar4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res = min(res, (uchar4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    vstore4(res, 0, dst_addr);
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Convert the int32 accumulator to float and multiply it by the real multiplier (REAL_MULTIPLIER)
+ *  -# Add the output offset (OUTPUT_OFFSET) and round to the nearest integer
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
  *
  * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
  * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
@@ -1671,45 +2827,53 @@
  * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                         Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                           dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
  */
-__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
 #if defined(ADD_BIAS)
-                                                             VECTOR_DECLARATION(biases),
+                                                        VECTOR_DECLARATION(biases),
 #endif // defined(ADD_BIAS)
-                                                             TENSOR3D_DECLARATION(dst))
+#if defined(DST_HEIGHT)
+                                                        TENSOR4D_DECLARATION(dst))
+#else  // defined(DST_HEIGHT)
+                                                        TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
 {
     // Compute source and destination addresses
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#if defined(ADD_BIAS)
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-#endif // defined(ADD_BIAS)
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
 
-    int16 input_values = vload16(0, (__global int *)src.ptr);
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
-    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
-    input_values += (int16)biases_values;
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
 #endif // defined(ADD_BIAS)
 
-    // Multiply by result_mult_int and shift
-    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);
+    // Convert to float
+    float4 input_values_f = convert_float4(input_values);
+    input_values_f        = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
 
-    // Add the offset terms to GEMM's result
-    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;
-
-    uchar16 res = convert_uchar16_sat(input_values);
+    uchar4 res = convert_uchar4_sat(input_values_f);
 
 #if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
+    res = max(res, (uchar4)MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
+    res = min(res, (uchar4)MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    vstore16(res, 0, dst.ptr);
+    vstore4(res, 0, dst_addr);
 }
-#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
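
Per element, the float output stage in gemmlowp_output_stage_quantize_down_float boils down to the scalar computation below; REAL_MULTIPLIER, OUTPUT_OFFSET and the accumulator are placeholder values.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar sketch of the float quantize-down output stage: scale by the real
// multiplier, add the output offset, round, then saturate to [0..255].
int main()
{
    const float real_multiplier = 0.0037f; // -DREAL_MULTIPLIER (placeholder)
    const float output_offset   = 128.0f;  // -DOUTPUT_OFFSET (placeholder)

    const int32_t acc = 23042; // int32 accumulator, bias already added (placeholder)

    const float   scaled  = std::round(static_cast<float>(acc) * real_multiplier + output_offset);
    const int32_t clamped = std::min(255, std::max(0, static_cast<int32_t>(scaled)));
    const uint8_t q       = static_cast<uint8_t>(clamped);
    std::printf("quantized value = %u\n", q);
    return 0;
}
```
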
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
new file mode 100644
index 0000000..bc6f4b5
--- /dev/null
+++ b/src/core/CL/cl_kernels/generate_proposals.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] rois_ptr                              Pointer to the rois tensor. Supported data types: same as @p anchors_ptr
+ * @param[out] rois_stride_x                         Stride of the rois tensor in X dimension (in bytes)
+ * @param[out] rois_step_x                           rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y                         Stride of the rois tensor in Y dimension (in bytes)
+ * @param[out] rois_step_y                           rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z                         Stride of the rois tensor in Z dimension (in bytes)
+ * @param[out] rois_step_z                           rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes    The offset of the first element in the rois tensor
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift is this thread using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively on the thread index)
+    const DATA_TYPE
+    shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
+    const DATA_TYPE
+    shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+    // Apply the shift to the anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
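
For reference, the same computation expressed on the host, assuming NUM_ROI_FIELDS is 4 (x1, y1, x2, y2); this is an illustrative sketch, not code from the library, and the function name and layout are hypothetical:

#include <cstddef>
#include <vector>

// Produces one shifted copy of every anchor per (x, y) grid cell, matching the
// index decomposition used by the kernel (anchor index varies fastest).
std::vector<float> compute_all_anchors_ref(const std::vector<float> &anchors, // num_anchors * 4 values
                                           std::size_t num_anchors, std::size_t width, std::size_t height, float stride)
{
    std::vector<float> rois(num_anchors * width * height * 4);
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            const float shift_x = static_cast<float>(x) * stride;
            const float shift_y = static_cast<float>(y) * stride;
            for(std::size_t a = 0; a < num_anchors; ++a)
            {
                const std::size_t out = ((y * width + x) * num_anchors + a) * 4;
                rois[out + 0] = anchors[a * 4 + 0] + shift_x;
                rois[out + 1] = anchors[a * 4 + 1] + shift_y;
                rois[out + 2] = anchors[a * 4 + 2] + shift_x;
                rois[out + 3] = anchors[a * 4 + 3] + shift_y;
            }
        }
    }
    return rois;
}
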
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 3f7a2a5..7ee97d9 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -24,23 +24,21 @@
 #ifndef ARM_COMPUTE_HELPER_H
 #define ARM_COMPUTE_HELPER_H
 
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
 #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
 
-#if defined(ARM_COMPUTE_DEBUG_ENABLED)
-#if defined(cl_arm_printf)
+#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 #pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-#endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
+#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
 
 #define EXPAND(x) x
 
@@ -185,7 +183,7 @@
  *
  * @return An image object
  */
-Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
 {
     Vector vector =
     {
@@ -208,7 +206,7 @@
  *
  * @return An image object
  */
-Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
 {
     Image img =
     {
@@ -234,7 +232,7 @@
  *
  * @return A 3D tensor object
  */
-Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
 {
     Image img =
     {
@@ -260,7 +258,7 @@
  *
  * @return A 3D tensor object
  */
-Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
 {
     Tensor3D tensor =
     {
@@ -274,7 +272,7 @@
     return tensor;
 }
 
-Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                              uint step_w,
                                              uint mod_size)
 {
@@ -297,7 +295,7 @@
  * @param[in] vec Pointer to the starting position of the buffer
  * @param[in] x   Relative X position
  */
-__global inline const uchar *vector_offset(const Vector *vec, int x)
+inline __global const uchar *vector_offset(const Vector *vec, int x)
 {
     return vec->ptr + x * vec->stride_x;
 }
@@ -308,7 +306,7 @@
  * @param[in] x   Relative X position
  * @param[in] y   Relative Y position
  */
-__global inline uchar *offset(const Image *img, int x, int y)
+inline __global uchar *offset(const Image *img, int x, int y)
 {
     return img->ptr + x * img->stride_x + y * img->stride_y;
 }
@@ -320,7 +318,7 @@
  * @param[in] y      Relative Y position
  * @param[in] z      Relative Z position
  */
-__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
+inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
 {
     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
 }
@@ -333,7 +331,7 @@
  * @param[in] z      Relative Z position
  * @param[in] w      Relative W position
  */
-__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
+inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
 {
     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
 }
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index a69bcc1..c314d17 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -62,6 +62,7 @@
         b_64 = convert_long##size(b);                                                                        \
         VEC_DATA_TYPE(long, size)                                                                            \
         ab_64 = a_64 * b_64;                                                                                 \
+        /* COMPMID-907 */                                                                                    \
         VEC_DATA_TYPE(int, size)                                                                             \
         ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31));                                       \
         return select(ab_x2_high32, INT_MAX, overflow);                                                      \
@@ -366,4 +367,4 @@
 ASYMM_RESCALE_IMPL(8)
 ASYMM_RESCALE_IMPL(16)
 
-#endif // ARM_COMPUTE_HELPERS_ASYMM_H
+#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file
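
The block touched above is the saturating rounding doubling high multiply used by the asymmetric quantization helpers. A scalar C++ sketch of the same arithmetic; the overflow condition (a == b == INT32_MIN) is computed by the macro outside the lines shown, so treat that part as an assumption:

#include <cstdint>
#include <limits>

int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const bool    overflow     = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab_64        = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + (1 << 30)) >> 31); // round, then take the doubled high half
    return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
}
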
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index f58e98b..5f66efb 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-/** This kernel performs reduction given an operation.
+/** This kernel performs L2 normalization along the x-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
@@ -42,7 +42,7 @@
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  epsilon                           Epsilon value
  */
-__kernel void l2_normalize(
+__kernel void l2_normalize_x(
     VECTOR_DECLARATION(src),
     VECTOR_DECLARATION(sum),
     VECTOR_DECLARATION(dst),
@@ -55,7 +55,104 @@
     VEC_DATA_TYPE(DATA_TYPE, 16)
     in = vload16(0, (__global DATA_TYPE *)src.ptr);
     VEC_DATA_TYPE(DATA_TYPE, 16)
-    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** This kernel performs L2 normalization along the y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
+
+    vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+/** This kernel performs L2 normalization along the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                           Epsilon value
+ */
+__kernel void l2_normalize_z(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst),
+    DATA_TYPE epsilon)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D sum = CONVERT_TO_TENSOR3D_STRUCT(sum);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, (__global DATA_TYPE *)src.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
 
     vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
 }
\ No newline at end of file
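
A host-side reference for what the axis-specific kernels compute per element, assuming the sum tensor already holds the sum of squares along the normalized axis (produced by the preceding reduction kernel); sketch only, names are illustrative:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> l2_normalize_ref(const std::vector<float> &values, float sum_of_squares, float epsilon)
{
    const float        norm = 1.0f / std::sqrt(std::max(sum_of_squares, epsilon));
    std::vector<float> out(values.size());
    for(std::size_t i = 0; i < values.size(); ++i)
    {
        out[i] = values[i] * norm; // same as in * rsqrt(fmax(sum, epsilon)) in the kernels
    }
    return out;
}
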
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
new file mode 100644
index 0000000..80b34eb
--- /dev/null
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with the given value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (work-items that would write past it step back so the access stays within bounds)
+ *
+ * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] tensor_step_x                        tensor_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] tensor_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] tensor_step_y                        tensor_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
+ * @note The fill value is not a kernel argument; it is passed at compile time via -DCONSTANT_VALUE
+ */
+__kernel void memset(
+    IMAGE_DECLARATION(tensor))
+{
+    Image tensor = CONVERT_TO_IMAGE_STRUCT(tensor);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if the access on the width dimension gets out of bounds
+    // If it does, shift the access back so that the elements accessed stay within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = (DATA_TYPE)(CONSTANT_VALUE);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+
+#endif // Check for compile time constants
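
The LAST_ACCESSED_X logic can be checked with a few lines of host code: a work-item whose VEC_SIZE-wide store would run past the tensor width is shifted back so the store stays in bounds, overlapping the previous work-item instead. Illustrative sketch, assuming the host sets LAST_ACCESSED_X to width - VEC_SIZE:

#include <algorithm>
#include <cstdio>

int main()
{
    const int width           = 30;
    const int vec_size        = 8;
    const int last_accessed_x = width - vec_size;
    for(int gid = 0; gid * vec_size < width; ++gid)
    {
        const int xi      = gid * vec_size;
        const int start_x = xi - std::max(xi - last_accessed_x, 0); // mirrors the tensor.ptr adjustment in the kernel
        std::printf("work-item %d writes elements [%d, %d)\n", gid, start_x, start_x + vec_size);
    }
    return 0;
}
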
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index dbdad27..0b6df39 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -92,6 +92,7 @@
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
 
+#if defined(WIDTH_SIZE)
 /** Apply in-map normalization.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
@@ -133,7 +134,7 @@
 
     const int current_col = get_global_id(0) << 2;
     const int left_pos    = max(-(int)RADIUS, -3 - current_col);
-    const int right_pos   = min((int)RADIUS, (int)((get_global_size(0) << 2) + 3 - 1 - current_col));
+    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
 
 #if defined(IN_MAP_2D)
     const int current_row = get_global_id(1);
@@ -168,3 +169,4 @@
 
     STORE_OP(normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
 }
+#endif // defined(WIDTH_SIZE)
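
With -DWIDTH_SIZE the in-map window is now clamped against the real tensor width rather than the padded global size. A quick sketch of the left/right clamping for a few work-items (4 elements per work-item, as in the kernel; the values below are illustrative):

#include <algorithm>
#include <cstdio>

int main()
{
    const int radius     = 2;
    const int width_size = 10;
    for(int gid = 0; gid < 3; ++gid)
    {
        const int current_col = gid << 2;
        const int left_pos    = std::max(-radius, -3 - current_col);
        const int right_pos   = std::min(radius, width_size - 1 - current_col);
        std::printf("col %d: window offsets [%d, %d]\n", current_col, left_pos, right_pos);
    }
    return 0;
}
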
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000..a105968
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+    const DATA_TYPE curr_std  = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    TYPE res  = (data - curr_mean) / curr_std;
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(0);
+
+    const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+    const TYPE curr_std  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE)));
+
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    TYPE res  = (data - curr_mean) / curr_std;
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
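
Both layouts apply the same per-element operation, (x - mean[c]) / std[c] with c the channel index. A host-side sketch for a single NCHW batch (layout handling is simplified and the names are illustrative):

#include <cstddef>
#include <vector>

std::vector<float> normalize_planar_yuv_ref(const std::vector<float> &src,
                                            const std::vector<float> &mean,
                                            const std::vector<float> &stddev,
                                            std::size_t elements_per_channel)
{
    std::vector<float> dst(src.size());
    for(std::size_t i = 0; i < src.size(); ++i)
    {
        const std::size_t c = i / elements_per_channel; // channel of element i in an NCHW buffer
        dst[i] = (src[i] - mean[c]) / stddev[c];
    }
    return dst;
}
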
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000..925975d
--- /dev/null
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    float16 curr_mean_flt = (float16)(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_mean_flt         = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 curr_std_flt = (float16)(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_std_flt         = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+    data_flt         = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+    // Perform normalization
+    float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the std tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(0);
+
+    float16 curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(mean.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+    curr_mean_flt         = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(std.ptr + current_slice * VEC_SIZE * sizeof(DATA_TYPE))), float16);
+    curr_std_flt         = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+    float16 data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), float16);
+    data_flt         = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+    // Perform normalization
+    float16 res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
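
The quantized path dequantizes with (q - OFFSET) * SCALE, normalizes in float, then requantizes with round(r / SCALE) + OFFSET and saturates. Scalar reference sketch of the same round trip (illustrative only):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t normalize_planar_yuv_q8_ref(uint8_t q_in, uint8_t q_mean, uint8_t q_std, float scale, int offset)
{
    const float x    = (static_cast<float>(q_in) - static_cast<float>(offset)) * scale;
    const float mean = (static_cast<float>(q_mean) - static_cast<float>(offset)) * scale;
    const float stdv = (static_cast<float>(q_std) - static_cast<float>(offset)) * scale;
    const float res  = (x - mean) / stdv;
    const int   q    = static_cast<int>(std::lround(res / scale)) + offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate to the QASYMM8 range
}
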
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index f4f36a0..9fa540e 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,7 @@
 #endif /* SATURATE */
 #define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
 
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
 /** Performs a pixelwise multiplication with float scale of either integer or float inputs.
  *
  * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -93,3 +94,4 @@
     // Store result
     vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index c99a08a..5b3acb7 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -32,6 +32,7 @@
 
 #define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
 
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT)
 /** Performs a pixelwise multiplication with integer scale of integer inputs.
  *
  * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
@@ -85,3 +86,70 @@
     // Perform multiplication and store result
     vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr);
 }
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(DATA_TYPE_RES) && defined(DATA_TYPE_OUT) */
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+/** Performs a pixelwise multiplication with float scale of quantized inputs.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: QASYMM8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: QASYMM8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out),
+    const float scale)
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
+    int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+
+    // Dequantize
+    in_a -= (int16)(int)OFFSET_IN1;
+    in_b -= (int16)(int)OFFSET_IN2;
+    const float16 in1f32 = convert_float16(in_a) * (float16)(float)SCALE_IN1;
+    const float16 in2f32 = convert_float16(in_b) * (float16)(float)SCALE_IN2;
+
+    const float16 qresf32 = (in1f32 * in2f32 * scale) / ((float16)(float)SCALE_OUT) + ((float16)(float)OFFSET_OUT);
+    const uchar16 res     = convert_uchar16_sat(convert_int16_rte(qresf32));
+
+    // Store result
+    vstore16(res, 0, (__global uchar *)out.ptr);
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
\ No newline at end of file
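
The new pixelwise_mul_quantized kernel dequantizes both operands, multiplies them together with the runtime float scale, then requantizes with the output quantization info and saturates to uchar. Scalar reference of the same arithmetic (a sketch; the kernel rounds to nearest-even, std::lround differs only at .5 ties):

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t pixelwise_mul_q8_ref(uint8_t a, uint8_t b, float scale,
                             int offset_in1, float scale_in1,
                             int offset_in2, float scale_in2,
                             int offset_out, float scale_out)
{
    const float af = (static_cast<float>(a) - static_cast<float>(offset_in1)) * scale_in1; // dequantize operand 1
    const float bf = (static_cast<float>(b) - static_cast<float>(offset_in2)) * scale_in2; // dequantize operand 2
    const float rf = (af * bf * scale) / scale_out + static_cast<float>(offset_out);       // requantize the product
    const int   q  = static_cast<int>(std::lround(rf));
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}
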
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 0808353..7d15d10 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -489,7 +489,11 @@
                                    const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int start_x = get_global_id(1) * stride_x - pad_x;
+#if defined(DST_DEPTH)
+    int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else  /* defined(DST_DEPTH) */
     int start_y = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
 
 #if !defined(EXCLUDE_PADDING)
     upper_bound_w += pad_x;
@@ -522,30 +526,43 @@
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 __kernel void pooling_layer_MxN_nhwc(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
 {
     // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#if defined(DST_DEPTH)
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D  input      = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D  output     = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
 
     VEC_DATA_TYPE(DATA_TYPE, 8)
     vdata           = INITIAL_VALUE;
     DATA_TYPE sdata = INITIAL_VALUE;
 
-    const int idx_width  = get_global_id(1) * STRIDE_X;
+    const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+    const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else  /* defined(DST_DEPTH) */
     const int idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
 
     for(int y = 0; y < POOL_SIZE_Y; ++y)
     {
@@ -555,8 +572,14 @@
             int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
             x1     = select(x1, PAD_X - idx_width - 1, y != y1);
 
+#if defined(DST_DEPTH)
+            VEC_DATA_TYPE(DATA_TYPE, 8)
+            data0 = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else  /* defined(DST_DEPTH) */
             VEC_DATA_TYPE(DATA_TYPE, 8)
             data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
 #if defined(POOL_L2)
             // Raise to power of 2 for L2 Pooling
             data0 *= data0;
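
When DST_DEPTH is defined the NHWC pooling kernels fold the batch dimension into the Z dimension of the NDRange, and get_global_id(2) is decoded into a (Z index, batch) pair. A small sketch of that decoding with illustrative values (DST_DEPTH stands for the size of the destination tensor's Z dimension):

#include <cstdio>

int main()
{
    const int dst_depth = 7; // stands in for DST_DEPTH
    const int batches   = 2;
    for(int gid2 = 0; gid2 < dst_depth * batches; ++gid2)
    {
        const int idx_z = gid2 % dst_depth; // used for the height offset in the kernel
        const int batch = gid2 / dst_depth;
        std::printf("gid2=%2d -> z index %d, batch %d\n", gid2, idx_z, batch);
    }
    return 0;
}
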
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 17d893a..198250b 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -126,7 +126,11 @@
                              const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 {
     int start_x = get_global_id(1) * stride_x - pad_x;
-    int start_y = get_global_id(2) * stride_y - pad_y;
+#if defined(DST_DEPTH)
+    int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y;
+#else  /* defined(DST_DEPTH) */
+    int            start_y    = get_global_id(2) * stride_y - pad_y;
+#endif /* defined(DST_DEPTH) */
 
     const int end_x = min(start_x + pool_size_x, upper_bound_w);
     const int end_y = min(start_y + pool_size_y, upper_bound_h);
@@ -153,39 +157,58 @@
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
  * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
 __kernel void pooling_layer_MxN_quantized_nhwc(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
 {
     // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#if defined(DST_DEPTH)
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D       input      = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D       output     = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* defined(DST_DEPTH) */
 
     int8 vdata = 0;
 
-    const int idx_width  = get_global_id(1) * STRIDE_X;
-    const int idx_height = get_global_id(2) * STRIDE_Y;
+    const int idx_width = get_global_id(1) * STRIDE_X;
+#if defined(DST_DEPTH)
+    const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y;
+#else  /* defined(DST_DEPTH) */
+    const int      idx_height = get_global_id(2) * STRIDE_Y;
+#endif /* defined(DST_DEPTH) */
 
     for(int y = 0; y < POOL_SIZE_Y; ++y)
     {
-        int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
+        int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT);
         for(int x = 0; x < POOL_SIZE_X; ++x)
         {
-            int x1      = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
-            x1          = select(x1, PAD_X - idx_width - 1, y != y1);
-            uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
-            int8 data0  = convert_int8(data);
-            vdata       = POOL_OP(vdata, data0);
+            int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
+            x1     = select(x1, PAD_X - idx_width - 1, y != y1);
+
+#if defined(DST_DEPTH)
+            uchar8 data = vload8(0, (__global uchar *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
+#else  /* defined(DST_DEPTH) */
+            uchar8 data       = vload8(0, (__global uchar *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+#endif /* defined(DST_DEPTH) */
+
+            int8 data0 = convert_int8(data);
+            vdata      = POOL_OP(vdata, data0);
         }
     }
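
The rewritten select() conditions express the border test directly in input coordinates: a tap whose position minus the padding falls outside [0, MAX_WIDTH) is redirected to the border element. A scalar C sketch of the same test, with illustrative PAD_X/MAX_WIDTH values:

#include <stdio.h>

/* Scalar equivalent of:
 *   x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH);
 * select(a, b, cond) returns b when cond is true. All values below are illustrative. */
static int clamp_tap(int x, int idx_width, int pad_x, int max_width)
{
    int in_x = x + idx_width - pad_x; /* tap position in the unpadded input */
    if(in_x < 0 || in_x >= max_width)
    {
        return pad_x - idx_width - 1; /* redirect to the border element */
    }
    return x;
}

int main(void)
{
    for(int x = 0; x < 3; ++x)
    {
        printf("x=%d -> x1=%d\n", x, clamp_tap(x, /*idx_width=*/0, /*PAD_X=*/1, /*MAX_WIDTH=*/4));
    }
    return 0;
}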
 
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
new file mode 100644
index 0000000..be072ec
--- /dev/null
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
+
+/** Compute prior boxes and clip (NCHW)
+ *
+ * @param[in,out] out        Image output
+ * @param[in]     idx        Index to write to
+ * @param[in]     center_x   Center value of the x axis
+ * @param[in]     center_y   Center value of the y axis
+ * @param[in]     box_width  Prior box width
+ * @param[in]     box_height Prior box height
+ */
+inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+    float xmin = (center_x - box_width / 2.f) / WIDTH;
+    float ymin = (center_y - box_height / 2.f) / HEIGHT;
+    float xmax = (center_x + box_width / 2.f) / WIDTH;
+    float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+    xmin = clamp(xmin, 0.f, 1.f);
+    ymin = clamp(ymin, 0.f, 1.f);
+    xmax = clamp(xmax, 0.f, 1.f);
+    ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+    // Store result
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0)));
+}
+
+/** Compute prior boxes (NCHW)
+ *
+ * @param[in,out] out                Image output
+ * @param[in]     max                The maximum values
+ * @param[in]     aspect_ratios      The aspect ratio values
+ * @param[in]     max_size           The maximum values size
+ * @param[in]     aspect_ratios_size The aspect ratio values size
+ * @param[in]     min_size           The prior box minimum size
+ * @param[in]     min_idx            Index of the min vector
+ * @param[in]     idx                Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+    const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X;
+    const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+    float box_width  = min_size;
+    float box_height = min_size;
+    calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+    idx += 4;
+
+    if(max_size > 0)
+    {
+        box_width  = sqrt(min_size * max[min_idx]);
+        box_height = box_width;
+        calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+    for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+    {
+        if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+        {
+            continue;
+        }
+        box_width  = min_size * sqrt(aspect_ratios[i]);
+        box_height = min_size * rsqrt(aspect_ratios[i]);
+
+        calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+
+    return idx;
+}
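
Per min size, the helper above writes one square box, optionally one geometric-mean box when a max size is given, and one box per aspect ratio different from 1. A small C sketch of the resulting box sides, using illustrative sizes and ratios (not taken from the patch):

#include <math.h>
#include <stdio.h>

int main(void)
{
    const float min_size = 30.f, max_size = 60.f; /* illustrative values */
    const float ratios[] = { 1.f, 2.f };

    printf("square:        %.1f x %.1f\n", min_size, min_size);
    printf("max box:       %.1f x %.1f\n", sqrtf(min_size * max_size), sqrtf(min_size * max_size));
    for(unsigned int i = 0; i < sizeof(ratios) / sizeof(ratios[0]); ++i)
    {
        if(fabsf(ratios[i] - 1.f) < 1e-6f)
        {
            continue; /* ratio 1 is already covered by the square box */
        }
        printf("ratio %.1f box: %.1f x %.1f\n", ratios[i], min_size * sqrtf(ratios[i]), min_size / sqrtf(ratios[i]));
    }
    return 0;
}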
+
+/** Compute prior boxes and clip (NHWC)
+ *
+ * @param[in,out] out        Tensor output
+ * @param[in]     idx        Index to write to
+ * @param[in]     center_x   Center value of the x axis
+ * @param[in]     center_y   Center value of the y axis
+ * @param[in]     box_width  Prior box width
+ * @param[in]     box_height Prior box height
+ */
+inline void calculate_xy_min_max_nhwc(Tensor3D *out, int idx, float center_x, float center_y, float box_width, float box_height)
+{
+    float xmin = (center_x - box_width / 2.f) / WIDTH;
+    float ymin = (center_y - box_height / 2.f) / HEIGHT;
+    float xmax = (center_x + box_width / 2.f) / WIDTH;
+    float ymax = (center_y + box_height / 2.f) / HEIGHT;
+
+#if defined(CLIP)
+    xmin = clamp(xmin, 0.f, 1.f);
+    ymin = clamp(ymin, 0.f, 1.f);
+    xmax = clamp(xmax, 0.f, 1.f);
+    ymax = clamp(ymax, 0.f, 1.f);
+#endif // defined(CLIP)
+
+    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 0, 0)) = xmin;
+    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 1, 0)) = ymin;
+    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 2, 0)) = xmax;
+    *((__global DATA_TYPE *)tensor3D_offset(out, 0, idx + 3, 0)) = ymax;
+}
+
+/** Compute prior boxes (NHWC)
+ *
+ * @param[in,out] out                Tensor output
+ * @param[in]     max                The maximum values
+ * @param[in]     aspect_ratios      The aspect ratio values
+ * @param[in]     max_size           The maximum values size
+ * @param[in]     aspect_ratios_size The aspect ratio values size
+ * @param[in]     min_size           The prior box minimum size
+ * @param[in]     min_idx            Index of the min vector
+ * @param[in]     idx                Index to write to
+ *
+ * @return The updated index
+ */
+inline int calculate_min_nhwc(Tensor3D *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx)
+{
+    const float center_x = ((float)(get_global_id(1) % LAYER_WIDTH) + OFFSET) * STEP_X;
+    const float center_y = ((float)(get_global_id(1) / LAYER_WIDTH) + OFFSET) * STEP_Y;
+
+    float box_width  = min_size;
+    float box_height = min_size;
+
+    calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+    idx += 4;
+    if(max_size > 0)
+    {
+        box_width  = sqrt(min_size * max[min_idx]);
+        box_height = box_width;
+        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+    for(unsigned int i = 0; i < aspect_ratios_size; ++i)
+    {
+        if(fabs(aspect_ratios[i] - 1.f) < 1e-6f)
+        {
+            continue;
+        }
+        box_width  = min_size * sqrt(aspect_ratios[i]);
+        box_height = min_size * rsqrt(aspect_ratios[i]);
+
+        calculate_xy_min_max_nhwc(out, idx, center_x, center_y, box_width, box_height);
+        idx += 4;
+    }
+
+    return idx;
+}
+
+/** Calculate prior boxes with NCHW format.
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  min                                  The minimum values
+ * @param[in]  max                                  The maximum values
+ * @param[in]  aspect_ratios                        The aspect ratio values
+ * @param[in]  min_size                             The minimum values size
+ * @param[in]  max_size                             The maximum values size
+ * @param[in]  aspect_ratios_size                   The aspect ratio values size
+ */
+__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+                                   unsigned int aspect_ratios_size)
+{
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+
+    int idx = 0;
+    for(unsigned int i = 0; i < min_size; ++i)
+    {
+        idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+    }
+
+    // Store variances
+    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+    {
+        vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
+    }
+}
+
+/** Calculate prior boxes with NHWC format.
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  min                                  The minimum values
+ * @param[in]  max                                  The maximum values
+ * @param[in]  aspect_ratios                        The aspect ratio values
+ * @param[in]  min_size                             The minimum values size
+ * @param[in]  max_size                             The maximum values size
+ * @param[in]  aspect_ratios_size                   The aspect ratio values size
+ */
+__kernel void prior_box_layer_nhwc(TENSOR3D_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
+                                   unsigned int aspect_ratios_size)
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int idx = 0;
+    for(unsigned int i = 0; i < min_size; ++i)
+    {
+        idx = calculate_min_nhwc(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
+    }
+
+    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
+    {
+        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 0, 1)) = VARIANCE_0;
+        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 1, 1)) = VARIANCE_1;
+        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 2, 1)) = VARIANCE_2;
+        *((__global DATA_TYPE *)tensor3D_offset(&out, 0, i + 3, 1)) = VARIANCE_3;
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
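
Both kernels lay the result out as NUM_PRIORS * 4 coordinates followed, on the next row (NCHW) or channel (NHWC), by the same four variances repeated per prior. A compact C sketch of that layout for two hypothetical priors:

#include <stdio.h>

/* Illustrative layout: row 0 holds {xmin, ymin, xmax, ymax} per prior, row 1 repeats the four variances. */
int main(void)
{
    const int   num_priors = 2;
    const float var[4]     = { 0.1f, 0.1f, 0.2f, 0.2f };
    float       out[2][8]  = { { 0.1f, 0.2f, 0.4f, 0.5f, 0.0f, 0.1f, 0.5f, 0.6f }, { 0 } };

    for(int i = 0; i < num_priors * 4; i += 4)
    {
        for(int c = 0; c < 4; ++c)
        {
            out[1][i + c] = var[c];
        }
    }
    for(int r = 0; r < 2; ++r)
    {
        for(int c = 0; c < num_priors * 4; ++c)
        {
            printf("%.2f ", out[r][c]);
        }
        printf("\n");
    }
    return 0;
}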
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index aa7403b..d76e12a 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -61,13 +61,14 @@
     return (in.s0 + in.s1);
 }
 
-/** This kernel performs reduction given an operation.
+/** This kernel performs parallel reduction given an operation on x-axis.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
  *
- * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: F16/F32
  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
@@ -81,7 +82,7 @@
  * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[in] local_sums                                Local buffer for storing the partial sum
  */
-__kernel void reduction_operation(
+__kernel void reduction_operation_x(
     IMAGE_DECLARATION(src),
     IMAGE_DECLARATION(partial_sum),
     __local DATA_TYPE *local_sums)
@@ -109,7 +110,207 @@
 
         if(lid == 0)
         {
+#if defined(MEAN) && defined(WIDTH)
+            if(y == get_local_size(1) - 1)
+            {
+                local_sums[0] /= WIDTH;
+            }
+#endif /* defined(MEAN) && defined(WIDTH) */
             ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
         }
     }
-}
\ No newline at end of file
+}
+
+#if defined(WIDTH)
+/** This kernel performs reduction on x-axis. (QASYMM8)
+ *
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in] output_ptr                           Pointer to the output tensor holding the summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_quantized_x(
+    VECTOR_DECLARATION(src),
+    VECTOR_DECLARATION(output))
+{
+    Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    uint res = 0;
+
+    for(unsigned int x = 0; x < WIDTH; ++x)
+    {
+        res += *((__global uchar *)vector_offset(&src, x));
+    }
+
+#if defined(MEAN)
+    res /= WIDTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    *((__global uchar *)output.ptr) = convert_uchar(res);
+}
+#endif /* defined(WIDTH) */
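
The quantized x-axis reduction is a serial loop that accumulates the uchar row into a 32-bit integer and optionally divides by WIDTH for the mean. A stand-alone C sketch of the same computation on illustrative data:

#include <stdio.h>

int main(void)
{
    const unsigned char row[] = { 250, 250, 250, 250 }; /* illustrative QASYMM8 row */
    const unsigned int  width = sizeof(row) / sizeof(row[0]);
    unsigned int        res   = 0;                      /* 32-bit accumulator, as in the kernel */

    for(unsigned int x = 0; x < width; ++x)
    {
        res += row[x];
    }
    res /= width;               /* MEAN path */
    printf("mean = %u\n", res); /* prints 250 */
    return 0;
}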
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in] output_ptr                           Pointer to the output tensor holding the summed values. Supported data types: same as @p src_ptr
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_y(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(output))
+{
+    Image src    = CONVERT_TO_IMAGE_STRUCT(src);
+    Image output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int y = 0; y < HEIGHT; ++y)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+        res += in;
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(HEIGHT) */
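
The y/z/w reductions accumulate in DATA_TYPE_PROMOTED and only convert back to DATA_TYPE when storing. A short C sketch (illustrative values) of why the promoted accumulator matters for narrow input types:

#include <stdio.h>

/* Summing HEIGHT uchar values in a uchar accumulator would wrap around,
 * which is what DATA_TYPE_PROMOTED avoids. Values are illustrative. */
int main(void)
{
    const unsigned char col[] = { 200, 200, 200 };
    unsigned char narrow = 0; /* what an un-promoted accumulator would do */
    unsigned int  wide   = 0; /* promoted accumulator, as used by the kernel */

    for(unsigned int y = 0; y < 3; ++y)
    {
        narrow = (unsigned char)(narrow + col[y]);
        wide += col[y];
    }
    printf("narrow=%u (wrapped), wide=%u\n", narrow, wide); /* 88 vs 600 */
    return 0;
}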
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           Pointer to the output tensor holding the summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int z = 0; z < DEPTH; ++z)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+        res += in;
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           Pointer to the output tensor holding the summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor
+ */
+__kernel void reduction_operation_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+    res = 0;
+
+    for(unsigned int w = 0; w < BATCH; ++w)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+        in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+        res += in;
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif /* defined(MEAN) */
+
+    // Store result
+    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
new file mode 100644
index 0000000..a275699
--- /dev/null
+++ b/src/core/CL/cl_kernels/reorg_layer.cl
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi)     \
+    ({                                                        \
+        int offset = zo / (int)SRC_DEPTH;                     \
+        xi         = xo * (int)STRIDE + offset % (int)STRIDE; \
+        yi         = yo * (int)STRIDE + offset / (int)STRIDE; \
+        zi         = zo % SRC_DEPTH;                          \
+    })
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int xo = get_global_id(0);
+    int yo = get_global_id(1);
+    int zo = get_global_id(2);
+    int xi, yi, zi;
+
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    int src_offset                   = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
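
CALCULATE_SRC_COORDINATES maps each output element back to the source: the part of the output channel index beyond SRC_DEPTH selects one of the STRIDE x STRIDE sub-positions, while the spatial indices are scaled by STRIDE. A plain C rendering of the macro with an illustrative SRC_DEPTH = 4 and STRIDE = 2:

#include <stdio.h>

/* Same arithmetic as the CALCULATE_SRC_COORDINATES macro above, with illustrative parameters. */
static void src_coords(int xo, int yo, int zo, int src_depth, int stride,
                       int *xi, int *yi, int *zi)
{
    int offset = zo / src_depth;            /* which STRIDE x STRIDE sub-position */
    *xi        = xo * stride + offset % stride;
    *yi        = yo * stride + offset / stride;
    *zi        = zo % src_depth;
}

int main(void)
{
    int xi, yi, zi;
    for(int zo = 0; zo < 16; zo += 5)
    {
        src_coords(1, 1, zo, /*SRC_DEPTH=*/4, /*STRIDE=*/2, &xi, &yi, &zi);
        printf("zo=%2d -> xi=%d yi=%d zi=%d\n", zo, xi, yi, zi);
    }
    return 0;
}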
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int xo = get_global_id(1);
+    int yo = get_global_id(2);
+    int zo = get_global_id(0);
+    int xi, yi, zi;
+
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
new file mode 100644
index 0000000..f52eb18
--- /dev/null
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to machine epsilon (for a floating point system, x and x + EPS are treated as the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] bin_size_x     Size of a pooling bin along x, projected onto the input tensor.
+ * @param[in] grid_size_x    Number of sampling points along x within a bin.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] bin_size_y     Size of a pooling bin along y, projected onto the input tensor.
+ * @param[in] grid_size_y    Number of sampling points along y within a bin.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int   pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+            sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+        }
+    }
+
+    return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
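
Each sample inside a bin is a standard bilinear interpolation of the four neighbouring input pixels, weighted by the fractional offsets lx/ly exactly as w1..w4 above. A self-contained C sketch of that interpolation on an illustrative 4x4 patch:

#include <stdio.h>

/* Bilinear sample at (x, y) from a 2x2 neighbourhood, mirroring the w1..w4 weighting above.
 * The 4x4 input values are illustrative. */
static float bilinear(const float img[4][4], float x, float y)
{
    const int   x_low = (int)x, y_low = (int)y;
    const float lx = x - x_low, ly = y - y_low;
    const float hx = 1.f - lx, hy = 1.f - ly;

    return hy * hx * img[y_low][x_low] + hy * lx * img[y_low][x_low + 1]
           + ly * hx * img[y_low + 1][x_low] + ly * lx * img[y_low + 1][x_low + 1];
}

int main(void)
{
    const float img[4][4] = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 }, { 12, 13, 14, 15 } };
    printf("%.2f\n", bilinear(img, 1.5f, 1.5f)); /* 7.50: average of 5, 6, 9 and 10 */
    return 0;
}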
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ *       will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pw = get_global_id(2);
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi                 = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    const float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2       region_start = spatial_indx * bin_size + roi_anchor;
+    float2       region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    const float2 roi_bin_grid = SAMPLING_RATIO;
+#else  // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                                                                        region_start.x,
+                                                                                                        bin_size.x,
+                                                                                                        roi_bin_grid.x,
+                                                                                                        region_end.x,
+                                                                                                        region_start.y,
+                                                                                                        bin_size.y,
+                                                                                                        roi_bin_grid.y,
+                                                                                                        region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
new file mode 100644
index 0000000..3211e7e
--- /dev/null
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
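
The two sampling policies differ only in the half-pixel shift: TOP_LEFT maps x_out to x_out * scale, while CENTER maps it to (x_out + 0.5) * scale - 0.5. A tiny C sketch comparing the two mappings for an illustrative 2x upscale:

#include <stdio.h>

int main(void)
{
    const float scale = 0.5f; /* illustrative input/output ratio for a 2x upscale */
    for(int x_out = 0; x_out < 4; ++x_out)
    {
        float top_left = x_out * scale;
        float center   = (x_out + 0.5f) * scale - 0.5f;
        printf("x_out=%d  top_left=%.2f  center=%.2f\n", x_out, top_left, center);
    }
    return 0;
}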
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for the QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for the QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
+    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for the QASYMM8 data type to be used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for the QASYMM8 data type to be used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source image in Z dimension (in bytes)
+ * @param[in]  in_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  out_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nhwc(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float new_x = get_global_id(1) * scale_x;
+    const float new_y = get_global_id(2) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+    const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
+    const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+
+    const float new_xf      = floor(new_x);
+    const float new_yf      = floor(new_y);
+    float       clamped_x   = clamp(new_xf, 0.0f, input_width - 1);
+    float       clamped_x1  = clamp(new_xf + 1, 0.0f, input_width - 1);
+    float       clamped_x_  = clamped_x;
+    float       clamped_x1_ = clamped_x1;
+    const float clamped_y   = clamp(new_yf, 0.0f, input_height - 1);
+    const float clamped_y1  = clamp(new_yf + 1, 0.0f, input_height - 1);
+
+#ifndef BORDER_MODE_REPLICATE
+    clamped_x1  = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1);
+    clamped_x_  = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x   = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1);
+    clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
+#endif /* BORDER_MODE_REPLICATE */
+
+    int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
+                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
+                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
+                      *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+
+    const float  a      = new_x - new_xf;
+    const float  b      = 1.f - a;
+    const float  a1     = new_y - new_yf;
+    const float  b1     = 1.f - a1;
+    const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
+
+    const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
+
+    uchar res = convert_uchar_sat(convert_int_sat_rtp(fr / SCALE) + OFFSET);
+
+    *((__global DATA_TYPE *)out.ptr) = res;
+}
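
    The kernel above dequantizes the four neighbouring texels, interpolates in float, and requantizes with round-towards-plus-infinity (convert_int_sat_rtp). A small host-side C++ sketch of that arithmetic, with hypothetical SCALE/OFFSET and texel values:

        #include <algorithm>
        #include <cmath>
        #include <cstdint>
        #include <cstdio>

        int main()
        {
            const float scale  = 0.1f;                 // hypothetical quantization scale
            const int   offset = 5;                    // hypothetical quantization offset
            const int   q[4]   = { 10, 20, 30, 40 };   // tl, tr, bl, br QASYMM8 texels
            const float a  = 0.25f, b  = 1.f - a;      // horizontal weights (new_x - floor(new_x))
            const float a1 = 0.5f,  b1 = 1.f - a1;     // vertical weights

            float f[4];
            for(int i = 0; i < 4; ++i)
            {
                f[i] = (q[i] - offset) * scale;        // dequantize
            }
            const float fr = f[0] * b * b1 + f[1] * a * b1 + f[2] * b * a1 + f[3] * a * a1;

            // Requantize: divide by scale, round towards +inf, add offset, saturate to uchar
            const int     requant = static_cast<int>(std::ceil(fr / scale)) + offset;
            const uint8_t res     = static_cast<uint8_t>(std::min(std::max(requant, 0), 255));
            std::printf("interpolated value: %u\n", res);
            return 0;
        }
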
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
new file mode 100644
index 0000000..bc3df47
--- /dev/null
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a strided slice operation on a given input.
+ *
+ * @attention Supported tensor rank: up to 4
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input and output tensor depths should be given as preprocessor arguments using -DSRC_DEPTH=size and -DDST_DEPTH=size
+ * @attention Absolute start coordinates for each dimension should be given as preprocessor -DSTART_index=value e.g. -DSTART_0=2
+ * @attention Strides for each dimension should be given as preprocessor -DSTRIDE_index=value e.g. -DSTRIDE_1=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void strided_slice(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+
+    int offset = 0;
+
+    // Offset X
+#if defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if the access on width gets out of bounds
+    // If it does, shift the access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    offset       = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+    input.ptr += offset * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+    offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+    input.ptr += offset * input_stride_x;
+#endif // defined(START_0) && defined(STRIDE_0)
+
+    // Offset Y
+#if defined(START_1) && defined(STRIDE_1)
+    offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+    input.ptr += offset * input_stride_y;
+#endif // defined(START_1) && defined(STRIDE_1)
+
+    // Offset Z
+#if defined(START_2) && defined(STRIDE_2)
+    offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+    input.ptr += offset * input_stride_z;
+#endif // defined(START_2) && defined(STRIDE_2)
+
+    // Offset depth
+#if defined(START_3) && defined(STRIDE_3)
+    offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+    input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
+
+    // Store result
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+    VSTORE(VEC_SIZE)
+    (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else  // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
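
    Apart from the vectorized X path and the packing of Z and batch into the third global id via DST_DEPTH, the kernel above reduces to a per-dimension mapping in_coord = START + out_coord * STRIDE. A worked example with hypothetical START/STRIDE values:

        #include <cstdio>

        int main()
        {
            const int start[4]     = { 2, 0, 1, 0 };   // hypothetical -DSTART_index values
            const int stride[4]    = { 2, 1, 1, 1 };   // hypothetical -DSTRIDE_index values
            const int out_coord[4] = { 3, 4, 0, 0 };   // one output element

            int in_coord[4];
            for(int d = 0; d < 4; ++d)
            {
                in_coord[d] = start[d] + out_coord[d] * stride[d];
            }
            std::printf("output (%d,%d,%d,%d) <- input (%d,%d,%d,%d)\n",
                        out_coord[0], out_coord[1], out_coord[2], out_coord[3],
                        in_coord[0], in_coord[1], in_coord[2], in_coord[3]);
            return 0;
        }
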
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 4ad8180..e549b44 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -64,6 +64,7 @@
 
 #endif /* VECTOR_SIZE END */
 
+// TODO (COMPMID-661): Remove if the non-fused kernels are removed
 __constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
 __constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 __constant uint4 idx4   = (uint4)(0, 1, 2, 3);
@@ -344,6 +345,7 @@
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
     // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
     int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
     if(lid < boundary_workitems)
     {
@@ -459,6 +461,7 @@
         sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
+    //TODO: Optimize the calculation (avoid %).
     boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
     if(lid < boundary_workitems)
     {
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index fcd1ec5..95d6d4b 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -301,6 +301,7 @@
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
     // How many work-items needed to complete the computation.
+    //TODO: Optimize this calculation (avoid %).
     int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
     if(lid < boundary_workitems)
     {
@@ -410,6 +411,7 @@
         sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
+    //TODO: Optimize the calculation (avoid %).
     boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
     if(lid < boundary_workitems)
     {
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644
index 0000000..d42a79d
--- /dev/null
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
+ * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                         Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                         Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddings_offset_first_element_in_bytes    The offset of the first element in the second source image
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                      Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                        block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in]  batch_id                                  The output tensor batch id
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+
+    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    {
+        const int r                      = (BATCH_SIZE / (block_x * block_y));
+        const int w                      = batch_id % r;
+        const int in_x                   = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
+        const int in_y                   = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
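
    All space_to_batch kernels in this file share the same index arithmetic: with r = BATCH_SIZE / (block_x * block_y), the source batch is batch_id % r, and batch_id / r selects the offset inside the block. A worked example with hypothetical shapes and padding:

        #include <cstdio>

        int main()
        {
            const int batch_size = 8;                  // hypothetical output batch size (-DBATCH_SIZE)
            const int block_x = 2, block_y = 2;        // hypothetical block shape
            const int pad_left_x = 1, pad_left_y = 1;  // hypothetical left paddings
            const int out_x = 3, out_y = 2;            // output spatial coordinate
            const int batch_id = 6;                    // output batch index

            const int r    = batch_size / (block_x * block_y);                          // = 2 input batches
            const int w    = batch_id % r;                                              // = 0 (source batch)
            const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x; // = (3-1)*2 + 3%2 = 5
            const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x; // = (2-1)*2 + 3/2 = 3
            std::printf("out (b=%d, x=%d, y=%d) <- in (b=%d, x=%d, y=%d)\n", batch_id, out_x, out_y, w, in_x, in_y);
            return 0;
        }
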
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
+ * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                         Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                         Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddings_offset_first_element_in_bytes    The offset of the first element in the second source image
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                      Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                        block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in]  batch_id                                  The output tensor batch id
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    {
+        const int r                      = (BATCH_SIZE / (block_x * block_y));
+        const int w                      = batch_id % r;
+        const int in_x                   = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
+        const int in_y                   = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  batch_id                             The output tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z     = get_global_id(2);
+
+    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    {
+        const int r                      = (BATCH_SIZE / (block_x * block_y));
+        const int w                      = batch_id % r;
+        const int in_x                   = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
+        const int in_y                   = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  batch_id                             The output tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    {
+        const int r                      = (BATCH_SIZE / (block_x * block_y));
+        const int w                      = batch_id % r;
+        const int in_x                   = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
+        const int in_y                   = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl
new file mode 100644
index 0000000..65912f5
--- /dev/null
+++ b/src/core/CL/cl_kernels/upsample_layer.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to access it might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set it might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if the access on width gets out of bounds
+    // If it does, shift the access vector to access elements within bounds
+    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
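
    The NCHW kernel above implements a 2x nearest-neighbour upsample: each loaded element is duplicated into two adjacent output columns and the widened row is written to rows y and y+1. A scalar reference sketch of the same idea, with hypothetical sizes:

        #include <cstdio>

        int main()
        {
            const int   w = 3, h = 2;                                       // hypothetical plane size
            const float src[h][w] = { { 1.f, 2.f, 3.f }, { 4.f, 5.f, 6.f } };
            float       dst[2 * h][2 * w];

            for(int y = 0; y < h; ++y)
            {
                for(int x = 0; x < w; ++x)
                {
                    // Each input element fills a 2x2 block in the output
                    dst[2 * y][2 * x]         = src[y][x];
                    dst[2 * y][2 * x + 1]     = src[y][x];
                    dst[2 * y + 1][2 * x]     = src[y][x];
                    dst[2 * y + 1][2 * x + 1] = src[y][x];
                }
            }
            std::printf("dst[1][3] = %f\n", dst[1][3]); // 2.0
            return 0;
        }
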
+
+/** This function applies upsample on an input image. (NHWC)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to access it might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set it might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if the access on width gets out of bounds
+    // If it does, shift the access vector to access elements within bounds
+    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)src.ptr);
+
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1));
+    vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1));
+#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
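
    In the NHWC variant the innermost dimension is channels, so instead of widening along X the kernel copies one input pixel's channel vector to all four pixels of the 2x2 output block. A scalar sketch of that layout difference (channel count hypothetical; the kernel itself vectorizes 16 channels at a time):

        #include <array>
        #include <cstdio>

        int main()
        {
            constexpr int               C     = 4;                      // hypothetical channel count
            const std::array<float, C>  pixel = { 1.f, 2.f, 3.f, 4.f }; // one input pixel, all channels
            std::array<float, C>        out[2][2];                      // 2x2 output block for this pixel

            for(int dy = 0; dy < 2; ++dy)
            {
                for(int dx = 0; dx < 2; ++dx)
                {
                    out[dy][dx] = pixel; // the whole channel vector is duplicated
                }
            }
            std::printf("out[1][1][2] = %f\n", out[1][1][2]);
            return 0;
        }
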
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 86a5e06..9afec7d 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -38,6 +38,7 @@
     return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
 }
 
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
 /** Clamps the given coordinates to the borders.
  *
  * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
@@ -125,6 +126,7 @@
     return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
 }
 
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
 /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
  *
  * @param[in] in     Pointer to the source image.
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
new file mode 100644
index 0000000..48d6fae
--- /dev/null
+++ b/src/core/CL/cl_kernels/warp_helpers_quantized.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+/** Clamps the given coordinates to the borders according to the border size.
+ *
+ * @param[in] coords      Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width       Width of the image
+ * @param[in] height      Height of the image
+ * @param[in] border_size Border size of the image
+ *
+ */
+inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
+{
+    const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
+    const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Clamps the given coordinates to the borders.
+ *
+ * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
+ * @param[in] width  Width of the image
+ * @param[in] height Height of the image
+ *
+ */
+inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
+{
+    return clamp_to_border_with_size_quantized(coords, width, height, 1);
+}
+
+/** Given a texel's coordinates this function will return the following array of coordinates:
+ * [ P, right neighbour, below neighbour, below right neighbour ]
+ *
+ * @note No checks are done here to see whether the coordinates are out of the image.
+ *
+ * @param[in] coord Input coordinates
+ *
+ * @return vector of 8 floats with the coordinates, even positions are x and odd y.
+ */
+inline const float8 get_neighbour_coords_quantized(const float2 coord)
+{
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+}
+
+/** Returns the current thread coordinates. */
+inline const float2 get_current_coords_quantized()
+{
+    return (float2)(get_global_id(0) * 4, get_global_id(1));
+}
+
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] border_size   Border size
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
+                                                                                    const float scale, const int offset_qasymm)
+{
+    // If any of the 4 texels is out of the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
+
+    // Sets the 4x4 coordinates for each of the four input texels
+    const float8  fc = floor(coords);
+    const float16 c1 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 = (float16)(
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
+                           clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
+
+    // Loads the values from the input image
+    const int16 t = (int16)(
+                        /* tl, tr, bl, br */
+                        * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+                        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+                        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+                        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+                        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+                        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+                        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+                        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+
+    const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
+
+    const float8 a  = coords - fc;
+    const float8 b  = ((float8)(1.f)) - a;
+    const float4 fr = (float4)(
+                          ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
+                          ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
+                          ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
+                          ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
+
+    const uchar4 res = convert_uchar4_sat(convert_int4_sat_rtp(fr / scale) + offset_qasymm);
+
+    return res;
+}
+
+/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */
+/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
+ *
+ * @param[in] in            Pointer to the source image.
+ * @param[in] coords        Vector of four 2D coordinates. Even pos is x and odd y.
+ * @param[in] width         Width of the image
+ * @param[in] height        Height of the image
+ * @param[in] scale         Scale value
+ * @param[in] offset_qasymm Offset value
+ */
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
+{
+    return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
+}
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 73da005..3b9b1e9 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -30,8 +30,9 @@
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -60,45 +61,54 @@
 
     // Load the values from the input tensor
 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    float3 w0 = vload3(0, (__global float *)(src_addr));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                         *((__global float *)(src_addr + 1 * src_stride_y)),
-                         *((__global float *)(src_addr + 2 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
 #else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float3 w0  = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float3 w1  = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float3 w2  = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
 
     // Row 0
-    float4 out0 = 0.0f;
-    out0.s0     = (w0.s0);
-    out0.s1     = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
-    out0.s2     = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
-    out0.s3     = (w0.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
-    float4 out1 = 0.0f;
-    out1.s0     = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
-    out1.s1     = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
-    out1.s2     = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
-    out1.s3     = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1    = 0.0f;
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
 
     // Row 2
-    float4 out2 = 0.0f;
-    out2.s0     = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
-    out2.s1     = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
-    out2.s2     = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
-    out2.s3     = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2    = 0.0f;
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
 
     // Row 3
-    float4 out3 = 0.0f;
-    out3.s0     = (w2.s0);
-    out3.s1     = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
-    out3.s2     = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
-    out3.s3     = (w2.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3    = 0.0f;
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     int z  = get_global_id(2);
@@ -111,24 +121,24 @@
     // Store the values across the channels
     // 16 channels for 3x3 kernels
     // 4 channels for 3x1 or 1x3 kernels
-    *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global float *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
-    *(__global float *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
-    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
-    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out2.s2;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out2.s3;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out3.s0;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out3.s1;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out3.s2;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
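
For reference, the per-element expressions in the kernel above are a spelled-out form of the Winograd filter transform w' = G * w * G^T for a 3x3 filter w and a 2x2 output tile. The matrix below is simply read off the coefficients of out0..out3 (for example, out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5 is row 1 of G applied to the first filter row, and out1.s1 = sum(w) * 0.25 is the (1,1) element of G w G^T); in the 3x1/1x3 variants only out0 is computed, i.e. G is applied on one side only.

\[
\hat{w} = G\,w\,G^{T},\qquad
G =
\begin{bmatrix}
1 & 0 & 0 \\
\tfrac{1}{2} & \tfrac{1}{2} & \tfrac{1}{2} \\
\tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\
0 & 0 & 1
\end{bmatrix}
\]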
 
@@ -137,8 +147,9 @@
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -167,71 +178,82 @@
 
     // Load the values from the input tensor
 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    float3 w0 = vload3(0, (__global float *)(src_addr));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float3 w0 = (float3)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                         *((__global float *)(src_addr + 1 * src_stride_y)),
-                         *((__global float *)(src_addr + 2 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
 #else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float3 w0  = vload3(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float3 w1  = vload3(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float3 w2  = vload3(0, (__global float *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
 
     // Row 0
-    float8 out0 = 0.0f;
-    out0.s0     = (w0.s0) / 16.f;
-    out0.s1     = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
-    out0.s2     = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
-    out0.s3     = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
-    out0.s4     = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
-    out0.s5     = (w0.s2) / 4.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0) / 16.f;
+    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s5 = (w0.s2) / 4.f;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
-    float8 out1 = 0.0f;
-    out1.s0     = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
-    out1.s1     = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
-    out1.s2     = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
-    out1.s3     = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
-    out1.s4     = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
-    out1.s5     = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
 
     // Row 2
-    float8 out2 = 0.0f;
-    out2.s0     = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
-    out2.s1     = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
-    out2.s2     = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
-    out2.s3     = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
-    out2.s4     = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
-    out2.s5     = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
 
     // Row 3
-    float8 out3 = 0.0f;
-    out3.s0     = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
-    out3.s1     = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out3.s2     = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out3.s3     = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out3.s4     = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out3.s5     = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
 
     // Row 4
-    float8 out4 = 0.0f;
-    out4.s0     = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
-    out4.s1     = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out4.s2     = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out4.s3     = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out4.s4     = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out4.s5     = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
 
     // Row 5
-    float8 out5 = 0.0f;
-    out5.s0     = (w2.s0) / 4.f;
-    out5.s1     = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
-    out5.s2     = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
-    out5.s3     = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
-    out5.s4     = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
-    out5.s5     = (w2.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (w2.s0) / 4.f;
+    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s5 = (w2.s2);
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     int z  = get_global_id(2);
@@ -244,44 +266,44 @@
     // Store the values across the channels
     // 36 channels for 3x3 kernels
     // 6 channels for 3x1 or 1x3 kernels
-    *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-    *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
-    *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
-    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s4;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s5;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out2.s0;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out2.s1;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out2.s2;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out2.s3;
-    *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s4;
-    *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s5;
-    *(__global float *)(dst_addr + 18 * dst_stride_z) = out3.s0;
-    *(__global float *)(dst_addr + 19 * dst_stride_z) = out3.s1;
-    *(__global float *)(dst_addr + 20 * dst_stride_z) = out3.s2;
-    *(__global float *)(dst_addr + 21 * dst_stride_z) = out3.s3;
-    *(__global float *)(dst_addr + 22 * dst_stride_z) = out3.s4;
-    *(__global float *)(dst_addr + 23 * dst_stride_z) = out3.s5;
-    *(__global float *)(dst_addr + 24 * dst_stride_z) = out4.s0;
-    *(__global float *)(dst_addr + 25 * dst_stride_z) = out4.s1;
-    *(__global float *)(dst_addr + 26 * dst_stride_z) = out4.s2;
-    *(__global float *)(dst_addr + 27 * dst_stride_z) = out4.s3;
-    *(__global float *)(dst_addr + 28 * dst_stride_z) = out4.s4;
-    *(__global float *)(dst_addr + 29 * dst_stride_z) = out4.s5;
-    *(__global float *)(dst_addr + 30 * dst_stride_z) = out5.s0;
-    *(__global float *)(dst_addr + 31 * dst_stride_z) = out5.s1;
-    *(__global float *)(dst_addr + 32 * dst_stride_z) = out5.s2;
-    *(__global float *)(dst_addr + 33 * dst_stride_z) = out5.s3;
-    *(__global float *)(dst_addr + 34 * dst_stride_z) = out5.s4;
-    *(__global float *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
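
The same reading applies to the 4x4 output-tile variant above: the /16, /24, /36, /96, /144, ... denominators are products of entries of the corresponding filter-transform matrix, i.e. the kernel computes w' = G * w * G^T with the 6x3 matrix read off from out0..out5 (for instance out0.s0 = w0.s0 / 16 = (1/4)^2 * w0.s0, out1.s1 = sum(w) / 36 = (1/6)^2 * sum(w), and out0.s3 = (w0.s0 + 2*w0.s1 + 4*w0.s2) / 96 = (1/4) * (w0.s0/24 + w0.s1/12 + w0.s2/6)):

\[
G =
\begin{bmatrix}
\tfrac{1}{4} & 0 & 0 \\
-\tfrac{1}{6} & -\tfrac{1}{6} & -\tfrac{1}{6} \\
-\tfrac{1}{6} & \tfrac{1}{6} & -\tfrac{1}{6} \\
\tfrac{1}{24} & \tfrac{1}{12} & \tfrac{1}{6} \\
\tfrac{1}{24} & -\tfrac{1}{12} & \tfrac{1}{6} \\
0 & 0 & 1
\end{bmatrix}
\]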
 
@@ -290,8 +312,9 @@
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -320,25 +343,25 @@
 
     // Load the values from the input tensor
 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float w00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float w01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float w02 = *((__global float *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
 #else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float  w00 = *((__global float *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
-    float  w01 = *((__global float *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
-    float  w02 = *((__global float *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y));
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    float  w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
-    float  w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
-    float  w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
-    float  w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
-    float  w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
-    float  w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     // Row 0
-    float out00, out01, out02, out03, out04, out05;
+    DATA_TYPE out00, out01, out02, out03, out04, out05;
     out00 = (w00) / 16.f;
     out01 = (-w00 - w01 - w02) / 24.f;
     out02 = (-w00 + w01 - w02) / 24.f;
@@ -348,7 +371,7 @@
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
-    float out10, out11, out12, out13, out14, out15;
+    DATA_TYPE out10, out11, out12, out13, out14, out15;
     out10 = (-w00 - w10 - w20) / 24.f;
     out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f;
     out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f;
@@ -357,7 +380,7 @@
     out15 = (-w02 - w12 - w22) / 6.f;
 
     // Row 2
-    float out20, out21, out22, out23, out24, out25;
+    DATA_TYPE out20, out21, out22, out23, out24, out25;
     out20 = (-w00 + w10 - w20) / 24.f;
     out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f;
     out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f;
@@ -366,7 +389,7 @@
     out25 = (-w02 + w12 - w22) / 6.f;
 
     // Row 3
-    float out30, out31, out32, out33, out34, out35;
+    DATA_TYPE out30, out31, out32, out33, out34, out35;
     out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f;
     out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
     out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f;
@@ -375,7 +398,7 @@
     out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f;
 
     // Row 4
-    float out40, out41, out42, out43, out44, out45;
+    DATA_TYPE out40, out41, out42, out43, out44, out45;
     out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f;
     out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
     out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f;
@@ -384,7 +407,7 @@
     out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f;
 
     // Row 5
-    float out50, out51, out52, out53, out54, out55;
+    DATA_TYPE out50, out51, out52, out53, out54, out55;
     out50 = (w20) / 4.f;
     out51 = (-w20 - w21 - w22) / 6.f;
     out52 = (-w20 + w21 - w22) / 6.f;
@@ -397,48 +420,48 @@
     int y0 = get_global_id(0); // idx channel
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
 
     // Store the values across the channels
     // 36 channels for 3x3 kernels
     // 6  channels for 3x1 or 1x3 kernels
-    *(__global float *)(dst_addr + 0 * dst_stride_z) = out00;
-    *(__global float *)(dst_addr + 1 * dst_stride_z) = out01;
-    *(__global float *)(dst_addr + 2 * dst_stride_z) = out02;
-    *(__global float *)(dst_addr + 3 * dst_stride_z) = out03;
-    *(__global float *)(dst_addr + 4 * dst_stride_z) = out04;
-    *(__global float *)(dst_addr + 5 * dst_stride_z) = out05;
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05;
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global float *)(dst_addr + 6 * dst_stride_z)  = out10;
-    *(__global float *)(dst_addr + 7 * dst_stride_z)  = out11;
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out12;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out13;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out14;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out15;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out20;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out21;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out22;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out23;
-    *(__global float *)(dst_addr + 16 * dst_stride_z) = out24;
-    *(__global float *)(dst_addr + 17 * dst_stride_z) = out25;
-    *(__global float *)(dst_addr + 18 * dst_stride_z) = out30;
-    *(__global float *)(dst_addr + 19 * dst_stride_z) = out31;
-    *(__global float *)(dst_addr + 20 * dst_stride_z) = out32;
-    *(__global float *)(dst_addr + 21 * dst_stride_z) = out33;
-    *(__global float *)(dst_addr + 22 * dst_stride_z) = out34;
-    *(__global float *)(dst_addr + 23 * dst_stride_z) = out35;
-    *(__global float *)(dst_addr + 24 * dst_stride_z) = out40;
-    *(__global float *)(dst_addr + 25 * dst_stride_z) = out41;
-    *(__global float *)(dst_addr + 26 * dst_stride_z) = out42;
-    *(__global float *)(dst_addr + 27 * dst_stride_z) = out43;
-    *(__global float *)(dst_addr + 28 * dst_stride_z) = out44;
-    *(__global float *)(dst_addr + 29 * dst_stride_z) = out45;
-    *(__global float *)(dst_addr + 30 * dst_stride_z) = out50;
-    *(__global float *)(dst_addr + 31 * dst_stride_z) = out51;
-    *(__global float *)(dst_addr + 32 * dst_stride_z) = out52;
-    *(__global float *)(dst_addr + 33 * dst_stride_z) = out53;
-    *(__global float *)(dst_addr + 34 * dst_stride_z) = out54;
-    *(__global float *)(dst_addr + 35 * dst_stride_z) = out55;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out10;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out11;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out12;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out13;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
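
The kernels above (and the 5x5 variant that follows) are now templated on DATA_TYPE rather than hard-coded to float, which is what the new @note lines and the F32/F16 entries in the parameter lists document. Below is a minimal host-side sketch of how these compile-time defines can be supplied; it uses the plain OpenCL API and assumes a cl_program and cl_device_id already exist (the library itself builds its kernels through its CLKernelLibrary plumbing, not this raw call):

// Illustrative sketch only: pass the defines documented in the @note lines above
// when compiling the .cl source. "program" and "device" are assumed to have been
// created beforehand with clCreateProgramWithSource()/clGetDeviceIDs().
const char *build_opts =
    "-DDATA_TYPE=half "                        // select the F16 path added by this change
    "-DSRC_DIM_Z=64 "                          // batch split: channels (NCHW) or height (NHWC)
    "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL";  // build the 3x1 variant; omit for the full 3x3 case
cl_int err = clBuildProgram(program, 1, &device, build_opts, nullptr, nullptr);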
 
@@ -448,8 +471,9 @@
  *
  * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE, e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -478,177 +502,192 @@
 
     // Load the values from the input tensor
 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    float4 w00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float  w01 = *((__global float *)(src_addr + 0 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
 #elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float4 w00 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                          *((__global float *)(src_addr + 1 * src_stride_y)),
-                          *((__global float *)(src_addr + 2 * src_stride_y)),
-                          *((__global float *)(src_addr + 3 * src_stride_y)));
-    float w01 = *((__global float *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
 #else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float4 w00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float  w01 = *((__global float *)(src_addr + 0 * src_stride_y) + 4);
-    float4 w10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float  w11 = *((__global float *)(src_addr + 1 * src_stride_y) + 4);
-    float4 w20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
-    float  w21 = *((__global float *)(src_addr + 2 * src_stride_y) + 4);
-    float4 w30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
-    float  w31 = *((__global float *)(src_addr + 3 * src_stride_y) + 4);
-    float4 w40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
-    float  w41 = *((__global float *)(src_addr + 4 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w10           = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w20           = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w30           = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w40           = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
 
     // Transform the input tile
 
     // Row 0
-    float8 out0 = 0.0f;
-    out0.s0     = w00.s0;
-    out0.s1     = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
-    out0.s2     = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
-    out0.s3     = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
-    out0.s4     = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
-    out0.s5     = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
-    out0.s6     = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
-    out0.s7     = w01;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = w00.s0;
+    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+    out0.s7 = w01;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
-    float8 out1 = 0.0f;
-    out1.s0     = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
-    out1.s1     = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
-                         (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
-    out1.s2     = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
-                         (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
-    out1.s3     = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
-                    (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
-    out1.s4     = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
-                    (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
-    out1.s5     = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
-                    (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
-    out1.s6     = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
-                    (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
-    out1.s7     = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
 
     // Row 2
-    float8 out2 = 0.0f;
-    out2.s0     = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
-    out2.s1     = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
-                         (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
-    out2.s2     = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
-                         (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
-    out2.s3     = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
-                    (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
-    out2.s4     = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
-                    (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
-    out2.s5     = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
-                    (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
-    out2.s6     = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
-                    (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
-    out2.s7     = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
 
     // Row 3
-    float8 out3 = 0.0f;
-    out3.s0     = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
-    out3.s1     = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
-                    (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                    (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
-    out3.s2     = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
-                    (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                    (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
-    out3.s3     = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-                   (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
-    out3.s4     = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-                   (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
-    out3.s5     = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                   (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
-    out3.s6     = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                   (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
-    out3.s7     = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
 
     // Row 4
-    float8 out4 = 0.0f;
-    out4.s0     = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
-    out4.s1     = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
-                    (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                    (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
-    out4.s2     = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
-                    (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                    (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
-    out4.s3     = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-                   (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
-    out4.s4     = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-                   (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
-    out4.s5     = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                   (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
-    out4.s6     = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-                   (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                   (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
-    out4.s7     = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
 
     // Row 5
-    float8 out5 = 0.0f;
-    out5.s0     = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
-    out5.s1     = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
-                    (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                    (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
-    out5.s2     = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
-                    (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                    (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
-    out5.s3     = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
-                   (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
-    out5.s4     = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
-                   (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
-    out5.s5     = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                   (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
-    out5.s6     = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                   (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
-    out5.s7     = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
 
     // Row 6
-    float8 out6 = 0.0f;
-    out6.s0     = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
-    out6.s1     = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
-                    (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                    (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
-    out6.s2     = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
-                    (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                    (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
-    out6.s3     = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
-                   (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
-    out6.s4     = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
-                   (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
-    out6.s5     = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                   (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
-    out6.s6     = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-                   (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                   (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
-    out6.s7     = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6    = 0.0f;
+    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
 
     // Row 7
-    float8 out7 = 0.0f;
-    out7.s0     = w40.s0;
-    out7.s1     = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
-    out7.s2     = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
-    out7.s3     = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
-    out7.s4     = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
-    out7.s5     = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
-    out7.s6     = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
-    out7.s7     = w41;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7    = 0.0f;
+    out7.s0 = w40.s0;
+    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+    out7.s7 = w41;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     int z  = get_global_id(2);
@@ -656,75 +695,75 @@
     int y0 = z % SRC_DIM_Z; // idx channel
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
 
     // Store the values across the channels
-    *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-    *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
-    *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-    *(__global float *)(dst_addr + 6 * dst_stride_z) = out0.s6;
-    *(__global float *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
-    *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
-    *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
-    *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
-    *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
-    *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
-    *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
-    *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
-    *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
-    *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
-    *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
-    *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
-    *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
-    *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
-    *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
-    *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
-    *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
-    *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
-    *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
-    *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
-    *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
-    *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
-    *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
-    *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
-    *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
-    *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
-    *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
-    *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
-    *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
-    *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
-    *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
-    *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
-    *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
-    *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
-    *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
-    *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
-    *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
-    *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
-    *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
-    *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
-    *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
-    *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
-    *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
-    *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
-    *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
-    *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
-    *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
-    *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
-    *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
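
For reference, the divisors used throughout the kernel above (9, 81, 90, 180, 405, 810, 8100, 16200, 32400) are consistent with the standard Winograd F(4x4, 5x5) filter transform, in which each 5x5 filter W is mapped to an 8x8 tile G * W * transpose(G) (or to a single 8-element row/column for the 5x1 / 1x5 variants) and the 64 results are scattered across the Z dimension of the destination. A minimal sketch of the assumed 8x5 transform matrix G:

    /* Assumed Winograd F(4x4, 5x5) filter-transform matrix G (8x5): out[i][j] = (G * W * G^T)[i][j]. */
    static const float G[8][5] =
    {
        {          1.f,           0.f,          0.f,           0.f,          0.f },
        {   -2.f / 9.f,    -2.f / 9.f,   -2.f / 9.f,    -2.f / 9.f,   -2.f / 9.f },
        {   -2.f / 9.f,     2.f / 9.f,   -2.f / 9.f,     2.f / 9.f,   -2.f / 9.f },
        {   1.f / 90.f,    2.f / 90.f,   4.f / 90.f,    8.f / 90.f,  16.f / 90.f },
        {   1.f / 90.f,   -2.f / 90.f,   4.f / 90.f,   -8.f / 90.f,  16.f / 90.f },
        { 16.f / 180.f,   8.f / 180.f,  4.f / 180.f,   2.f / 180.f,  1.f / 180.f },
        { 16.f / 180.f,  -8.f / 180.f,  4.f / 180.f,  -2.f / 180.f,  1.f / 180.f },
        {          0.f,           0.f,          0.f,           0.f,          1.f }
    };

For example, "Row 5" first forms (G * W)[5][k] = (16 * w0k + 8 * w1k + 4 * w2k + 2 * w3k + w4k) / 180 and then folds in the rows of G a second time, which is where factors such as 1/810 (= 2 / (9 * 180)) and 1/32400 (= 1 / (180 * 180)) come from.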
 
@@ -733,8 +772,9 @@
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
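
To make the new -DDATA_TYPE contract concrete, here is a minimal self-contained sketch (assuming the VEC_DATA_TYPE token-pasting helper from the library's helpers.h; the kernel name and buffer below are hypothetical) of how the templated declarations in this patch resolve when the program is built for F16 or F32:

    #if defined(cl_khr_fp16)
    #pragma OPENCL EXTENSION cl_khr_fp16 : enable // required when DATA_TYPE=half
    #endif

    // Token-pasting helper as in helpers.h: VEC_DATA_TYPE(half, 8) -> half8
    #define VEC_DATA_TYPE_STR(type, size) type##size
    #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

    // Build with e.g. "-DDATA_TYPE=half" or "-DDATA_TYPE=float"; on the host side the
    // library typically derives this string from the tensor's data type.
    __kernel void data_type_demo(__global DATA_TYPE *dst)
    {
        VEC_DATA_TYPE(DATA_TYPE, 8)
        v    = 0.0f;                  // half8 or float8, mirroring the pattern used by the patch
        v.s0 = (16.f / 180.f);        // float literals convert implicitly to DATA_TYPE components
        dst[get_global_id(0)] = v.s0; // element stores are sizeof(DATA_TYPE) bytes: 2 for F16, 4 for F32
    }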
@@ -759,242 +799,250 @@
 {
     Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
 
-    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(float) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
+    const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w;
 
 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Load the values from the input tensor
-    float w00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float w01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float w02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float w03 = *((__global float *)(src_addr + 3 * src_stride_z));
-    float w04 = *((__global float *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
 #else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Load the values from the input tensor
-    float w00 = *((__global float *)(src_addr + 0 * src_stride_y));
-    float w01 = *((__global float *)(src_addr + 1 * src_stride_y));
-    float w02 = *((__global float *)(src_addr + 2 * src_stride_y));
-    float w03 = *((__global float *)(src_addr + 3 * src_stride_y));
-    float w04 = *((__global float *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    float w10 = *((__global float *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
-    float w11 = *((__global float *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
-    float w12 = *((__global float *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
-    float w13 = *((__global float *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
-    float w14 = *((__global float *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
-    float w20 = *((__global float *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
-    float w21 = *((__global float *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
-    float w22 = *((__global float *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
-    float w23 = *((__global float *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
-    float w24 = *((__global float *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
-    float w30 = *((__global float *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
-    float w31 = *((__global float *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
-    float w32 = *((__global float *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
-    float w33 = *((__global float *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
-    float w34 = *((__global float *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
-    float w40 = *((__global float *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
-    float w41 = *((__global float *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
-    float w42 = *((__global float *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
-    float w43 = *((__global float *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
-    float w44 = *((__global float *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y));
+    DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y));
+    DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y));
+    DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y));
+    DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y));
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     // Row 0
-    float8 out0 = 0.0f;
-    out0.s0     = w00;
-    out0.s1     = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
-    out0.s2     = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
-    out0.s3     = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
-    out0.s4     = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
-    out0.s5     = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
-    out0.s6     = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
-    out0.s7     = w04;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = w00;
+    out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f;
+    out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f;
+    out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f;
+    out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f;
+    out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f;
+    out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f;
+    out0.s7 = w04;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
     // Row 1
-    float8 out1 = 0.0f;
-    out1.s0     = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
-    out1.s1     = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
-    out1.s2     = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
-    out1.s3     = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
-                    (w04 + w14 + w24 + w34 + w44)) / 405.f;
-    out1.s4     = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
-                    (w04 + w14 + w24 + w34 + w44)) / 405.f;
-    out1.s5     = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
-                    (w04 + w14 + w24 + w34 + w44)) / 810.f;
-    out1.s6     = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
-                    (w04 + w14 + w24 + w34 + w44)) / 810.f;
-    out1.s7     = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f;
+    out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+    out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f;
+    out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+                (w04 + w14 + w24 + w34 + w44)) / 405.f;
+    out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f *
+                (w04 + w14 + w24 + w34 + w44)) / 405.f;
+    out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) +
+                (w04 + w14 + w24 + w34 + w44)) / 810.f;
+    out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) +
+                (w04 + w14 + w24 + w34 + w44)) / 810.f;
+    out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f;
 
     // Row 2
-    float8 out2 = 0.0f;
-    out2.s0     = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
-    out2.s1     = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
-    out2.s2     = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
-    out2.s3     = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
-                    (w04 - w14 + w24 - w34 + w44)) / 405.f;
-    out2.s4     = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
-                    (w04 - w14 + w24 - w34 + w44)) / 405.f;
-    out2.s5     = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
-                    (w04 - w14 + w24 - w34 + w44)) / 810.f;
-    out2.s6     = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
-                    (w04 - w14 + w24 - w34 + w44)) / 810.f;
-    out2.s7     = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = -2.f * (w00 - w10 + w20 - w30 + w40) / 9.f;
+    out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+    out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f;
+    out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+                (w04 - w14 + w24 - w34 + w44)) / 405.f;
+    out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f *
+                (w04 - w14 + w24 - w34 + w44)) / 405.f;
+    out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) +
+                (w04 - w14 + w24 - w34 + w44)) / 810.f;
+    out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) +
+                (w04 - w14 + w24 - w34 + w44)) / 810.f;
+    out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f;
 
     // Row 3
-    float8 out3 = 0.0f;
-    out3.s0     = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
-    out3.s1     = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
-                    (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
-    out3.s2     = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
-                    (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
-    out3.s3     = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
-    out3.s4     = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
-    out3.s5     = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
-    out3.s6     = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
-    out3.s7     = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f;
+    out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) +
+                (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+    out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) -
+                (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f;
+    out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f
+               * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+    out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f
+               * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f;
+    out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+    out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f;
+    out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f;
 
     // Row 4
-    float8 out4 = 0.0f;
-    out4.s0     = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
-    out4.s1     = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
-                    (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
-    out4.s2     = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
-                    (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
-    out4.s3     = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
-    out4.s4     = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
-    out4.s5     = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
-    out4.s6     = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
-                   (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
-    out4.s7     = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f;
+    out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) +
+                (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+    out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) -
+                (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f;
+    out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f
+               * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+    out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f
+               * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f;
+    out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+    out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f *
+               (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f;
+    out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f;
 
     // Row 5
-    float8 out5 = 0.0f;
-    out5.s0     = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
-    out5.s1     = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
-                    (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
-    out5.s2     = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
-                    (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
-    out5.s3     = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
-    out5.s4     = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
-    out5.s5     = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
-    out5.s6     = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
-    out5.s7     = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f;
+    out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) +
+                (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+    out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) -
+                (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f;
+    out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f
+               * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+    out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f
+               * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+               (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f *
+               (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f;
+    out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f;
 
     // Row 6
-    float8 out6 = 0.0f;
-    out6.s0     = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
-    out6.s1     = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
-                    (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
-    out6.s2     = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
-                    (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
-    out6.s3     = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
-    out6.s4     = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
-    out6.s5     = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
-    out6.s6     = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
-                   (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
-    out6.s7     = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6    = 0.0f;
+    out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f;
+    out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) +
+                (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+    out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) -
+                (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f;
+    out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f
+               * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+    out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f
+               * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+               (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f *
+               (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f;
+    out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f;
 
     // Row 7
-    float8 out7 = 0.0f;
-    out7.s0     = w40;
-    out7.s1     = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
-    out7.s2     = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
-    out7.s3     = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
-    out7.s4     = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
-    out7.s5     = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
-    out7.s6     = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
-    out7.s7     = w44;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7    = 0.0f;
+    out7.s0 = w40;
+    out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f;
+    out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f;
+    out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f;
+    out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f;
+    out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f;
+    out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f;
+    out7.s7 = w44;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 
     int x0 = get_global_id(2); // idx filter
     int y0 = get_global_id(0); // idx channel
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(float) + y0 * dst_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
 
     // Store the values across the channels
-    *(__global float *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global float *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global float *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global float *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-    *(__global float *)(dst_addr + 4 * dst_stride_z) = out0.s4;
-    *(__global float *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-    *(__global float *)(dst_addr + 6 * dst_stride_z) = out0.s6;
-    *(__global float *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
 
 #if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global float *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
-    *(__global float *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
-    *(__global float *)(dst_addr + 10 * dst_stride_z) = out1.s2;
-    *(__global float *)(dst_addr + 11 * dst_stride_z) = out1.s3;
-    *(__global float *)(dst_addr + 12 * dst_stride_z) = out1.s4;
-    *(__global float *)(dst_addr + 13 * dst_stride_z) = out1.s5;
-    *(__global float *)(dst_addr + 14 * dst_stride_z) = out1.s6;
-    *(__global float *)(dst_addr + 15 * dst_stride_z) = out1.s7;
-    *(__global float *)(dst_addr + 16 * dst_stride_z) = out2.s0;
-    *(__global float *)(dst_addr + 17 * dst_stride_z) = out2.s1;
-    *(__global float *)(dst_addr + 18 * dst_stride_z) = out2.s2;
-    *(__global float *)(dst_addr + 19 * dst_stride_z) = out2.s3;
-    *(__global float *)(dst_addr + 20 * dst_stride_z) = out2.s4;
-    *(__global float *)(dst_addr + 21 * dst_stride_z) = out2.s5;
-    *(__global float *)(dst_addr + 22 * dst_stride_z) = out2.s6;
-    *(__global float *)(dst_addr + 23 * dst_stride_z) = out2.s7;
-    *(__global float *)(dst_addr + 24 * dst_stride_z) = out3.s0;
-    *(__global float *)(dst_addr + 25 * dst_stride_z) = out3.s1;
-    *(__global float *)(dst_addr + 26 * dst_stride_z) = out3.s2;
-    *(__global float *)(dst_addr + 27 * dst_stride_z) = out3.s3;
-    *(__global float *)(dst_addr + 28 * dst_stride_z) = out3.s4;
-    *(__global float *)(dst_addr + 29 * dst_stride_z) = out3.s5;
-    *(__global float *)(dst_addr + 30 * dst_stride_z) = out3.s6;
-    *(__global float *)(dst_addr + 31 * dst_stride_z) = out3.s7;
-    *(__global float *)(dst_addr + 32 * dst_stride_z) = out4.s0;
-    *(__global float *)(dst_addr + 33 * dst_stride_z) = out4.s1;
-    *(__global float *)(dst_addr + 34 * dst_stride_z) = out4.s2;
-    *(__global float *)(dst_addr + 35 * dst_stride_z) = out4.s3;
-    *(__global float *)(dst_addr + 36 * dst_stride_z) = out4.s4;
-    *(__global float *)(dst_addr + 37 * dst_stride_z) = out4.s5;
-    *(__global float *)(dst_addr + 38 * dst_stride_z) = out4.s6;
-    *(__global float *)(dst_addr + 39 * dst_stride_z) = out4.s7;
-    *(__global float *)(dst_addr + 40 * dst_stride_z) = out5.s0;
-    *(__global float *)(dst_addr + 41 * dst_stride_z) = out5.s1;
-    *(__global float *)(dst_addr + 42 * dst_stride_z) = out5.s2;
-    *(__global float *)(dst_addr + 43 * dst_stride_z) = out5.s3;
-    *(__global float *)(dst_addr + 44 * dst_stride_z) = out5.s4;
-    *(__global float *)(dst_addr + 45 * dst_stride_z) = out5.s5;
-    *(__global float *)(dst_addr + 46 * dst_stride_z) = out5.s6;
-    *(__global float *)(dst_addr + 47 * dst_stride_z) = out5.s7;
-    *(__global float *)(dst_addr + 48 * dst_stride_z) = out6.s0;
-    *(__global float *)(dst_addr + 49 * dst_stride_z) = out6.s1;
-    *(__global float *)(dst_addr + 50 * dst_stride_z) = out6.s2;
-    *(__global float *)(dst_addr + 51 * dst_stride_z) = out6.s3;
-    *(__global float *)(dst_addr + 52 * dst_stride_z) = out6.s4;
-    *(__global float *)(dst_addr + 53 * dst_stride_z) = out6.s5;
-    *(__global float *)(dst_addr + 54 * dst_stride_z) = out6.s6;
-    *(__global float *)(dst_addr + 55 * dst_stride_z) = out6.s7;
-    *(__global float *)(dst_addr + 56 * dst_stride_z) = out7.s0;
-    *(__global float *)(dst_addr + 57 * dst_stride_z) = out7.s1;
-    *(__global float *)(dst_addr + 58 * dst_stride_z) = out7.s2;
-    *(__global float *)(dst_addr + 59 * dst_stride_z) = out7.s3;
-    *(__global float *)(dst_addr + 60 * dst_stride_z) = out7.s4;
-    *(__global float *)(dst_addr + 61 * dst_stride_z) = out7.s5;
-    *(__global float *)(dst_addr + 62 * dst_stride_z) = out7.s6;
-    *(__global float *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
 #endif // defined(SRC_DIM_Z)
@@ -1004,8 +1052,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
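
The -DDATA_TYPE option documented in the notes above is resolved on the host before these filter-transform kernels are compiled. Below is a minimal sketch of the corresponding build options; the concrete values (half, 64) are assumptions for illustration and in practice come from the weights tensor's data type and Z dimension. In the library such a set is typically handed to CLKernelLibrary::get().create_kernel().

    #include <set>
    #include <string>

    // Sketch only: compile-time defines matching the @note entries above.
    // The concrete values are assumptions for illustration.
    std::set<std::string> winograd_filter_transform_build_options()
    {
        std::set<std::string> opts;
        opts.insert("-DDATA_TYPE=half"); // "half" for F16 weights, "float" for F32
        opts.insert("-DSRC_DIM_Z=64");   // Z dimension of the weights tensor
        return opts;
    }
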
@@ -1052,8 +1101,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1100,8 +1150,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1148,8 +1199,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1196,8 +1248,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1246,8 +1299,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1294,8 +1348,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1342,8 +1397,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1390,8 +1446,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1438,8 +1495,9 @@
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
  * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index da18e4a..34bf290 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -52,8 +52,9 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
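
The WINOGRAD_INPUT_TRANSFORM_HORIZONTAL/VERTICAL toggles mentioned in the notes above select the 1-D variants of this transform. A hedged host-side sketch of how that choice could be derived from the filter shape follows; the kernel_w/kernel_h parameters and the helper name are illustrative, not the library's API.

    #include <set>
    #include <string>

    // Sketch only: pick the 1-D Winograd input-transform variant from the filter shape.
    std::set<std::string> winograd_input_transform_variant(unsigned int kernel_w, unsigned int kernel_h)
    {
        std::set<std::string> opts;
        if(kernel_w == 3 && kernel_h == 1)
        {
            opts.insert("-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); // 3x1 filter
        }
        else if(kernel_w == 1 && kernel_h == 3)
        {
            opts.insert("-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); // 1x3 filter
        }
        // 3x3 filter: neither define is set and the full 2-D transform is used.
        return opts;
    }
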
@@ -69,86 +70,113 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_2x2_3x3_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row0 = vload4(0, (__global float *)(src_addr));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                              *((__global float *)(src_addr + 1 * src_stride_y)),
-                              *((__global float *)(src_addr + 2 * src_stride_y)),
-                              *((__global float *)(src_addr + 3 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
 #else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float4       in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float4       in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float4       in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
-    float4       in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
 #endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float4 tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     tmp0 -= in_row2;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float out00 = tmp0.s0 - tmp0.s2;
-    float out01 = tmp0.s1 + tmp0.s2;
-    float out02 = tmp0.s2 - tmp0.s1;
-    float out03 = tmp0.s1 - tmp0.s3;
+    DATA_TYPE out00 = tmp0.s0 - tmp0.s2;
+    DATA_TYPE out01 = tmp0.s1 + tmp0.s2;
+    DATA_TYPE out02 = tmp0.s2 - tmp0.s1;
+    DATA_TYPE out03 = tmp0.s1 - tmp0.s3;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float4 tmp1 = in_row1 + in_row2;
-    float4 tmp2 = in_row2 - in_row1;
-    float4 tmp3 = in_row1 - in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
 
-    float out10 = tmp1.s0 - tmp1.s2;
-    float out11 = tmp1.s1 + tmp1.s2;
-    float out12 = tmp1.s2 - tmp1.s1;
-    float out13 = tmp1.s1 - tmp1.s3;
+    DATA_TYPE out10 = tmp1.s0 - tmp1.s2;
+    DATA_TYPE out11 = tmp1.s1 + tmp1.s2;
+    DATA_TYPE out12 = tmp1.s2 - tmp1.s1;
+    DATA_TYPE out13 = tmp1.s1 - tmp1.s3;
 
-    float out20 = tmp2.s0 - tmp2.s2;
-    float out21 = tmp2.s1 + tmp2.s2;
-    float out22 = tmp2.s2 - tmp2.s1;
-    float out23 = tmp2.s1 - tmp2.s3;
+    DATA_TYPE out20 = tmp2.s0 - tmp2.s2;
+    DATA_TYPE out21 = tmp2.s1 + tmp2.s2;
+    DATA_TYPE out22 = tmp2.s2 - tmp2.s1;
+    DATA_TYPE out23 = tmp2.s1 - tmp2.s3;
 
-    float out30 = tmp3.s0 - tmp3.s2;
-    float out31 = tmp3.s1 + tmp3.s2;
-    float out32 = tmp3.s2 - tmp3.s1;
-    float out33 = tmp3.s1 - tmp3.s3;
+    DATA_TYPE out30 = tmp3.s0 - tmp3.s2;
+    DATA_TYPE out31 = tmp3.s1 + tmp3.s2;
+    DATA_TYPE out32 = tmp3.s2 - tmp3.s1;
+    DATA_TYPE out33 = tmp3.s1 - tmp3.s3;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
 
-    *((__global float *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
-    *((__global float *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
-    *((__global float *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
-    *((__global float *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 4 * dst_stride_z))  = out10;
-    *((__global float *)(dst_addr + 5 * dst_stride_z))  = out11;
-    *((__global float *)(dst_addr + 6 * dst_stride_z))  = out12;
-    *((__global float *)(dst_addr + 7 * dst_stride_z))  = out13;
-    *((__global float *)(dst_addr + 8 * dst_stride_z))  = out20;
-    *((__global float *)(dst_addr + 9 * dst_stride_z))  = out21;
-    *((__global float *)(dst_addr + 10 * dst_stride_z)) = out22;
-    *((__global float *)(dst_addr + 11 * dst_stride_z)) = out23;
-    *((__global float *)(dst_addr + 12 * dst_stride_z)) = out30;
-    *((__global float *)(dst_addr + 13 * dst_stride_z)) = out31;
-    *((__global float *)(dst_addr + 14 * dst_stride_z)) = out32;
-    *((__global float *)(dst_addr + 15 * dst_stride_z)) = out33;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z))  = out10;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z))  = out11;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z))  = out12;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z))  = out13;
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out20;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out21;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
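
For reference, the arithmetic in the kernel above is the standard Winograd F(2x2, 3x3) input transform B^T d B applied to the 4x4 tile d; the tmp*/out** expressions are its rows and columns:

    B^T = \begin{pmatrix} 1 & 0 & -1 & 0 \\ 0 & 1 & 1 & 0 \\ 0 & -1 & 1 & 0 \\ 0 & 1 & 0 & -1 \end{pmatrix}

so, for example, out00 = (d00 - d20) - (d02 - d22) is entry (0,0) of B^T d B, while the 1-D HORIZONTAL/VERTICAL paths apply B^T only once, along the single non-unit dimension.
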
 
@@ -160,8 +188,9 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -177,107 +206,159 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_2x2_3x3_stepz2_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2) * 2;
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = (get_global_id(2) * 2) % SRC_DEPTH;
+    const int b = (get_global_id(2) * 2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2) * 2;
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row0 = vload4(0, (__global float *)(src_addr));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr));
 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row0 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                              *((__global float *)(src_addr + 1 * src_stride_y)),
-                              *((__global float *)(src_addr + 2 * src_stride_y)),
-                              *((__global float *)(src_addr + 3 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
 #else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float4       in_row0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float4       in_row1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float4       in_row2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
-    float4       in_row3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
 #endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     src_addr += src_stride_z;
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row4 = vload4(0, (__global float *)(src_addr));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr));
 #elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float4 in_row4 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                              *((__global float *)(src_addr + 1 * src_stride_y)),
-                              *((__global float *)(src_addr + 2 * src_stride_y)),
-                              *((__global float *)(src_addr + 3 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                            *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
 #else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float4       in_row4 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float4       in_row5 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float4       in_row6 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
-    float4       in_row7 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
 #endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float4 tmp0 = in_row0;
-    float4 tmp4 = in_row4;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp4 = in_row4;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     tmp0 -= in_row2;
     tmp4 -= in_row6;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float2 out00 = (float2)(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
-    float2 out01 = (float2)(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
-    float2 out02 = (float2)(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
-    float2 out03 = (float2)(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3);
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float4 tmp1 = in_row1 + in_row2;
-    float4 tmp2 = in_row2 - in_row1;
-    float4 tmp3 = in_row1 - in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp1 = in_row1 + in_row2;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp2 = in_row2 - in_row1;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp3 = in_row1 - in_row3;
 
-    float4 tmp5 = in_row5 + in_row6;
-    float4 tmp6 = in_row6 - in_row5;
-    float4 tmp7 = in_row5 - in_row7;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp5 = in_row5 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp6 = in_row6 - in_row5;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    tmp7 = in_row5 - in_row7;
 
-    float2 out10 = (float2)(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
-    float2 out11 = (float2)(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
-    float2 out12 = (float2)(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
-    float2 out13 = (float2)(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3);
 
-    float2 out20 = (float2)(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
-    float2 out21 = (float2)(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
-    float2 out22 = (float2)(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
-    float2 out23 = (float2)(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3);
 
-    float2 out30 = (float2)(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
-    float2 out31 = (float2)(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
-    float2 out32 = (float2)(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
-    float2 out33 = (float2)(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3);
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
 
-    vstore2(out00, 0, (__global float *)(dst_addr + 0 * dst_stride_z));
-    vstore2(out01, 0, (__global float *)(dst_addr + 1 * dst_stride_z));
-    vstore2(out02, 0, (__global float *)(dst_addr + 2 * dst_stride_z));
-    vstore2(out03, 0, (__global float *)(dst_addr + 3 * dst_stride_z));
+    vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z));
+    vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z));
+    vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z));
+    vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z));
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    vstore2(out10, 0, (__global float *)(dst_addr + 4 * dst_stride_z));
-    vstore2(out11, 0, (__global float *)(dst_addr + 5 * dst_stride_z));
-    vstore2(out12, 0, (__global float *)(dst_addr + 6 * dst_stride_z));
-    vstore2(out13, 0, (__global float *)(dst_addr + 7 * dst_stride_z));
-    vstore2(out20, 0, (__global float *)(dst_addr + 8 * dst_stride_z));
-    vstore2(out21, 0, (__global float *)(dst_addr + 9 * dst_stride_z));
-    vstore2(out22, 0, (__global float *)(dst_addr + 10 * dst_stride_z));
-    vstore2(out23, 0, (__global float *)(dst_addr + 11 * dst_stride_z));
-    vstore2(out30, 0, (__global float *)(dst_addr + 12 * dst_stride_z));
-    vstore2(out31, 0, (__global float *)(dst_addr + 13 * dst_stride_z));
-    vstore2(out32, 0, (__global float *)(dst_addr + 14 * dst_stride_z));
-    vstore2(out33, 0, (__global float *)(dst_addr + 15 * dst_stride_z));
+    vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z));
+    vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z));
+    vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z));
+    vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z));
+    vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z));
+    vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z));
+    vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z));
+    vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z));
+    vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z));
+    vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z));
+    vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z));
+    vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z));
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
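
A small worked example of the SRC_DEPTH decomposition used by both kernels above (plain illustration, not library code): the host packs plane and batch into the third global dimension, and the stepz2 variant additionally processes two consecutive planes per work-item.

    #include <cstdio>

    // Sketch only: how get_global_id(2) maps back to (plane, batch) when -DSRC_DEPTH is defined.
    // The values are assumptions for illustration; SRC_DEPTH comes from the input tensor.
    int main()
    {
        const int src_depth = 64;                     // -DSRC_DEPTH=64
        const int gid2      = 130;                    // third global id of one work-item
        const int z1        = gid2 % src_depth;       // stepz1 kernel: plane 2 ...
        const int b1        = gid2 / src_depth;       // ... of batch 2
        const int z2        = (gid2 * 2) % src_depth; // stepz2 kernel: planes 4 and 5 ...
        const int b2        = (gid2 * 2) / src_depth; // ... of batch 4
        std::printf("stepz1: z=%d b=%d, stepz2: z=%d b=%d\n", z1, b1, z2, b2);
        return 0;
    }
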
 
@@ -289,8 +370,9 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -306,40 +388,57 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute input address
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row0
-    float4 d00 = (float4)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                          *((__global float *)(src_addr + 1 * src_stride_y)),
-                          *((__global float *)(src_addr + 2 * src_stride_y)),
-                          *((__global float *)(src_addr + 3 * src_stride_y)));
-    float2 d01 = (float2)(*((__global float *)(src_addr + 4 * src_stride_y)),
-                          *((__global float *)(src_addr + 5 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
 #else  // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row0
-    float4       d00     = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
-    float2       d01     = vload2(2, (__global float *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float out0 = 0.0f;
-    float out1 = 0.0f;
-    float out2 = 0.0f;
-    float out3 = 0.0f;
-    float out4 = 0.0f;
-    float out5 = 0.0f;
+    DATA_TYPE out0 = 0.0f;
+    DATA_TYPE out1 = 0.0f;
+    DATA_TYPE out2 = 0.0f;
+    DATA_TYPE out3 = 0.0f;
+    DATA_TYPE out4 = 0.0f;
+    DATA_TYPE out5 = 0.0f;
 
     // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
     out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
@@ -351,16 +450,18 @@
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row4
-    float4 d40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y));
-    float2 d41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
 
     // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
-    float k0 = d41.s0;
-    float k1 = d41.s0;
-    float k2 = d41.s0;
-    float k3 = d41.s0;
-    float k4 = d41.s0;
-    float k5 = 0.0f;
+    DATA_TYPE k0 = d41.s0;
+    DATA_TYPE k1 = d41.s0;
+    DATA_TYPE k2 = d41.s0;
+    DATA_TYPE k3 = d41.s0;
+    DATA_TYPE k4 = d41.s0;
+    DATA_TYPE k5 = 0.0f;
 
     k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
     k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
@@ -377,8 +478,10 @@
     out5 += k5;
 
     // Row2
-    float4 d20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
-    float2 d21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
 
     out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
     out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
@@ -389,9 +492,13 @@
 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Compute destination address
-    __global float *dst_addr = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#if defined(SRC_DEPTH)
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(SRC_DEPTH) */
 
-    uint dst_plane_stride = dst_stride_z / sizeof(float);
+    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
 
     *(dst_addr) = out0;
     dst_addr += dst_plane_stride;
@@ -407,69 +514,73 @@
     dst_addr += dst_plane_stride;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float out6  = k0;
-    float out7  = k1;
-    float out8  = k2;
-    float out9  = k3;
-    float out10 = k4;
-    float out11 = k5;
-    float out12 = k0;
-    float out13 = k1;
-    float out14 = k2;
-    float out15 = k3;
-    float out16 = k4;
-    float out17 = k5;
-    float out18 = k0;
-    float out19 = k1;
-    float out20 = k2;
-    float out21 = k3;
-    float out22 = k4;
-    float out23 = k5;
-    float out24 = k0;
-    float out25 = k1;
-    float out26 = k2;
-    float out27 = k3;
-    float out28 = k4;
-    float out29 = k5;
+    DATA_TYPE out6  = k0;
+    DATA_TYPE out7  = k1;
+    DATA_TYPE out8  = k2;
+    DATA_TYPE out9  = k3;
+    DATA_TYPE out10 = k4;
+    DATA_TYPE out11 = k5;
+    DATA_TYPE out12 = k0;
+    DATA_TYPE out13 = k1;
+    DATA_TYPE out14 = k2;
+    DATA_TYPE out15 = k3;
+    DATA_TYPE out16 = k4;
+    DATA_TYPE out17 = k5;
+    DATA_TYPE out18 = k0;
+    DATA_TYPE out19 = k1;
+    DATA_TYPE out20 = k2;
+    DATA_TYPE out21 = k3;
+    DATA_TYPE out22 = k4;
+    DATA_TYPE out23 = k5;
+    DATA_TYPE out24 = k0;
+    DATA_TYPE out25 = k1;
+    DATA_TYPE out26 = k2;
+    DATA_TYPE out27 = k3;
+    DATA_TYPE out28 = k4;
+    DATA_TYPE out29 = k5;
 
     // Row1
-    float4 d10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
-    float2 d11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
 
     // Row3
-    float4 d30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
-    float2 d31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
 
     // Compute common parts for the channels between [6, 29]
     // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
     // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
-    float part0  = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
-    float part1  = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
-    float part2  = 16.0f * d20.s2 - 4.0f * d21.s0;
-    float part3  = 16.0f * d20.s1 - 4.0f * d20.s3;
-    float part4  = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
-    float part5  = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
-    float part6  = 4.0f * d20.s2 - 4.0f * d21.s0;
-    float part7  = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
-    float part8  = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
-    float part9  = 8.0f * d20.s1 - 8.0f * d20.s3;
-    float part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
-    float part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
+    DATA_TYPE part0  = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part1  = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
+    DATA_TYPE part2  = 16.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part3  = 16.0f * d20.s1 - 4.0f * d20.s3;
+    DATA_TYPE part4  = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
+    DATA_TYPE part5  = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
+    DATA_TYPE part6  = 4.0f * d20.s2 - 4.0f * d21.s0;
+    DATA_TYPE part7  = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
+    DATA_TYPE part8  = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
+    DATA_TYPE part9  = 8.0f * d20.s1 - 8.0f * d20.s3;
+    DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
+    DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
 
     // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
     // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
-    float part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
-    float part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
-    float part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
-    float part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
-    float part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
-    float part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
-    float part18 = part6 * 0.25f; // d20.s2 - d21.s0
-    float part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
-    float part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
-    float part21 = part9 * 0.25f;                                                 // 2.0f * (d20.s1 - d20.s3)
-    float part22 = part10 * 0.25f;                                                // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
-    float part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
+    DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
+    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
+    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
+    DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
+    DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
+    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
+    DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
+    DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
+    DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
+    DATA_TYPE part21 = part9 * 0.25f;                                                 // 2.0f * (d20.s1 - d20.s3)
+    DATA_TYPE part22 = part10 * 0.25f;                                                // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
+    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
 
     out6 += part0 - part1;
     out12 += part0 + part1;
@@ -548,8 +659,10 @@
     dst_addr += dst_plane_stride;
 
     // Row5
-    float4 d50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y));
-    float2 d51 = vload2(2, (__global float *)(src_addr + 5 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
 
     // Channels [30, 35]
     out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
@@ -574,19 +687,17 @@
 #endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
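
The scalar coefficients used throughout the 4x4/3x3 transform above (16, -20, 4, 25, -5, ...) are products of entries of the F(4x4, 3x3) input-transform matrix, which in the commonly used convention (interpolation points 0, +-1, +-2) is

    B^T = \begin{pmatrix} 4 & 0 & -5 & 0 & 1 & 0 \\ 0 & -4 & -4 & 1 & 1 & 0 \\ 0 & 4 & -4 & -1 & 1 & 0 \\ 0 & -2 & -1 & 2 & 1 & 0 \\ 0 & 2 & -1 & -2 & 1 & 0 \\ 0 & 4 & 0 & -5 & 0 & 1 \end{pmatrix}

Output channel (r, c) accumulates B^T_{r,i} B^T_{c,j} d_{ij}; e.g. channel 0 picks up 16 d00 - 20 d02 + 4 d04 from row 0 and -20 d20 + 25 d22 - 5 d24 from row 2, which is exactly what out0 collects above.
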
 
-#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5, 5x1 or 1x5, the output tile is 4x4, 4x1 or 1x4 and the data layout is NCHW
  *
  * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
  * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
- * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -602,16 +713,234 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else  /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+    // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Calculate common factors for intermediate tensor
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25 * in_row4;
+    tmp0 += -in_row6 + (DATA_TYPE)5.25 * in_row4 - (DATA_TYPE)5.25 * in_row2;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25 * in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)0.25 * in_row2 - (DATA_TYPE)1.25 * in_row4 + in_row6;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
+
+    comm_fact0 = (DATA_TYPE)2.5 * in_row3;
+    comm_fact1 = (DATA_TYPE)0.5 * in_row1 - comm_fact0 + (DATA_TYPE)2.0 * in_row5;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
+
+    comm_fact1 = (DATA_TYPE)2.0 * in_row1 - comm_fact0 + (DATA_TYPE)0.5 * in_row5;
+    comm_fact2 = (DATA_TYPE)4.0 * in_row2 - (DATA_TYPE)5.0 * in_row4 + in_row6;
+
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25 * in_row3 - (DATA_TYPE)5.25 * in_row5;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Calculate output rows (reuse comm_fact0 vector)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0;
+
+    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1, out2, out3, out4, out5, out6, out7;
+
+    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
+    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    // Store values across the channels
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* defined(SRC_DEPTH) */
+
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+
+#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
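As a rough illustration of the compile-time defines documented for this kernel, a host-side options string might look like the following (the values are hypothetical and not taken from this patch); such a string would be passed as the options argument of clBuildProgram():

    // Hypothetical example only: defines consumed by winograd_input_transform_4x4_5x5_stepz1_nchw.
    const char *build_opts =
        " -DDATA_TYPE=float"                      // or half for F16
        " -DNUM_TILES_X=5"
        " -DPAD_LEFT=2 -DPAD_TOP=2"
        " -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4"
        " -DSRC_DEPTH=64";                        // optional: enables the batched (b) addressing path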
+
+#if defined(SRC_DIM_1) && defined(SRC_DIM_2)
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
+ * @note Dimension one of the input tensor (width for NHWC data layout) must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM_1=112)
+ * @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
 
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float);
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr  = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
 
     // Clamp coordinates. This clamp is valid for all rows
     int4 y_coord0 = (int4)(y * OUTPUT_TILE_W) + (int4)(0, 1, 2, 3) - (int4)PAD_LEFT;
@@ -637,19 +966,19 @@
     // Clamp z coordinate
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    float d40 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d41 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d42 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d43 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d44 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d45 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d40 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d41 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d42 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d43 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d44 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d45 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
-    float k0 = d44;
-    float k1 = d44;
-    float k2 = d44;
-    float k3 = d44;
-    float k4 = d44;
-    float k5 = (float)0.0f;
+    DATA_TYPE k0 = d44;
+    DATA_TYPE k1 = d44;
+    DATA_TYPE k2 = d44;
+    DATA_TYPE k3 = d44;
+    DATA_TYPE k4 = d44;
+    DATA_TYPE k5 = (DATA_TYPE)0.0f;
 
     k0 += 4.0f * d40 - 5.0f * d42;
     k1 += -4.0f * d41 - 4.0f * d42 + d43;
@@ -674,12 +1003,12 @@
     valid_y1 = y_coord1;
 #endif // if PAD_TOP == 0, we cannot read out of bound
 
-    float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d02 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 #else  // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     int4 z_coords0 = (int4)(z * OUTPUT_TILE_H) + (int4)(0, 1, 2, 3) - (int4)PAD_TOP;
     int2 z_coords1 = (int2)(z * OUTPUT_TILE_H) + (int2)(4, 5) - (int2)PAD_TOP;
@@ -692,20 +1021,20 @@
     z_coords0 = clamp((int4)z_coords0, (int4)0, (int4)((int)SRC_DIM_2 - 1));
     z_coords1 = clamp((int2)z_coords1, (int2)0, (int2)((int)SRC_DIM_2 - 1));
 
-    float d00 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
-    float d01 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
-    float d02 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
-    float d03 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
-    float d04 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
-    float d05 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
+    DATA_TYPE d00 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coords0.s0 * src_stride_z);
+    DATA_TYPE d01 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coords0.s1 * src_stride_z);
+    DATA_TYPE d02 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coords0.s2 * src_stride_z);
+    DATA_TYPE d03 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coords0.s3 * src_stride_z);
+    DATA_TYPE d04 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coords1.s0 * src_stride_z);
+    DATA_TYPE d05 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coords1.s1 * src_stride_z);
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
-    float out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
-    float out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
-    float out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
-    float out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
-    float out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
-    float out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
+    DATA_TYPE out0 = 16.0f * d00 - 20.0f * d02 + 4.0f * d04;
+    DATA_TYPE out1 = -16.0f * d01 - 16.0f * d02 + 4.0f * d03 + 4.0f * d04;
+    DATA_TYPE out2 = 16.0f * d01 - 16.0f * d02 - 4.0f * d03 + 4.0f * d04;
+    DATA_TYPE out3 = -8.0f * d01 - 4.0f * d02 + 8.0f * d03 + 4.0f * d04;
+    DATA_TYPE out4 = 8.0f * d01 - 4.0f * d02 - 8.0f * d03 + 4.0f * d04;
+    DATA_TYPE out5 = 16.0f * d01 - 20.0f * d03 + 4.0f * d05;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
     // Row2
@@ -716,12 +1045,12 @@
     valid_y1 = select(valid_y1, (int2)SRC_DIM_1, (int2)z_coord >= (int)SRC_DIM_2);
     z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    float d20 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d21 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d22 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d23 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d24 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d25 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d20 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d21 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d22 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d23 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d24 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d25 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
     out0 += k0;
     out1 += k1;
@@ -729,30 +1058,30 @@
     out3 += k3;
     out4 += k4;
     out5 += k5;
-    float out6  = k0;
-    float out7  = k1;
-    float out8  = k2;
-    float out9  = k3;
-    float out10 = k4;
-    float out11 = k5;
-    float out12 = k0;
-    float out13 = k1;
-    float out14 = k2;
-    float out15 = k3;
-    float out16 = k4;
-    float out17 = k5;
-    float out18 = k0;
-    float out19 = k1;
-    float out20 = k2;
-    float out21 = k3;
-    float out22 = k4;
-    float out23 = k5;
-    float out24 = k0;
-    float out25 = k1;
-    float out26 = k2;
-    float out27 = k3;
-    float out28 = k4;
-    float out29 = k5;
+    DATA_TYPE out6  = k0;
+    DATA_TYPE out7  = k1;
+    DATA_TYPE out8  = k2;
+    DATA_TYPE out9  = k3;
+    DATA_TYPE out10 = k4;
+    DATA_TYPE out11 = k5;
+    DATA_TYPE out12 = k0;
+    DATA_TYPE out13 = k1;
+    DATA_TYPE out14 = k2;
+    DATA_TYPE out15 = k3;
+    DATA_TYPE out16 = k4;
+    DATA_TYPE out17 = k5;
+    DATA_TYPE out18 = k0;
+    DATA_TYPE out19 = k1;
+    DATA_TYPE out20 = k2;
+    DATA_TYPE out21 = k3;
+    DATA_TYPE out22 = k4;
+    DATA_TYPE out23 = k5;
+    DATA_TYPE out24 = k0;
+    DATA_TYPE out25 = k1;
+    DATA_TYPE out26 = k2;
+    DATA_TYPE out27 = k3;
+    DATA_TYPE out28 = k4;
+    DATA_TYPE out29 = k5;
 
     // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
     out0 += -20.0f * d20 + 25.0f * d22 - 5.0f * d24;
@@ -764,20 +1093,25 @@
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Compute destination address
-    __global float *dst_addr         = (__global float *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(float) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
-    uint            dst_plane_stride = dst_stride_z / sizeof(float);
+#if defined(NUM_TILES_Y)
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
+#else  /* defined(NUM_TILES_Y) */
+    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y);
+#endif /* defined(NUM_TILES_Y) */
 
-    *((__global float *)dst_addr) = out0;
+    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
+
+    *((__global DATA_TYPE *)dst_addr) = out0;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out1;
+    *((__global DATA_TYPE *)dst_addr) = out1;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out2;
+    *((__global DATA_TYPE *)dst_addr) = out2;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out3;
+    *((__global DATA_TYPE *)dst_addr) = out3;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out4;
+    *((__global DATA_TYPE *)dst_addr) = out4;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out5;
+    *((__global DATA_TYPE *)dst_addr) = out5;
     dst_addr += dst_plane_stride;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
@@ -787,12 +1121,12 @@
     valid_y0 = y_coord0;
     valid_y1 = y_coord1;
 
-    float d10 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d11 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d12 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d13 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d14 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d15 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d10 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d11 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d12 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d13 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d14 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d15 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row3
     z_coord  = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
@@ -803,43 +1137,43 @@
     z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
     z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    float d30 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d31 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d32 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d33 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d34 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d35 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d30 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d31 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d32 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d33 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d34 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d35 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Compute common parts for the channels between [6, 29]
     // Channels [6, 11]:  [out10, out11, out12, out13, out14, out15]
     // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
-    float part0  = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
-    float part1  = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
-    float part2  = 16.0f * d22 - 4.0f * d24;
-    float part3  = 16.0f * d21 - 4.0f * d23;
-    float part4  = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
-    float part5  = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
-    float part6  = 4.0f * d22 - 4.0f * d24;
-    float part7  = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
-    float part8  = 4.0f * d12 - 4.0f * d14 - d32 + d34;
-    float part9  = 8.0f * d21 - 8.0f * d23;
-    float part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
-    float part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
+    DATA_TYPE part0  = -16.0f * d20 + 20.0f * d22 - 4.0f * d24;
+    DATA_TYPE part1  = 16.0f * d10 - 20.0f * d12 + 4.0f * d14 - 4.0f * d30 + 5.0f * d32 - d34;
+    DATA_TYPE part2  = 16.0f * d22 - 4.0f * d24;
+    DATA_TYPE part3  = 16.0f * d21 - 4.0f * d23;
+    DATA_TYPE part4  = 16.0f * d12 - 4.0f * d14 - 4.0f * d32 + d34;
+    DATA_TYPE part5  = 16.0f * d11 - 4.0f * d13 - 4.0f * d31 + d33;
+    DATA_TYPE part6  = 4.0f * d22 - 4.0f * d24;
+    DATA_TYPE part7  = 8.0f * d11 - 8.0f * d13 - 2.0f * d31 + 2.0f * d33;
+    DATA_TYPE part8  = 4.0f * d12 - 4.0f * d14 - d32 + d34;
+    DATA_TYPE part9  = 8.0f * d21 - 8.0f * d23;
+    DATA_TYPE part10 = -16.0f * d21 + 20.0f * d23 - 4.0f * d25;
+    DATA_TYPE part11 = -16.0f * d11 + 20.0f * d13 - 4.0f * d15 + 4.0f * d31 - 5.0f * d33 + d35;
 
     // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
     // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
-    float part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
-    float part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
-    float part14 = part2 * 0.25f; // 4.0f * d22 - d24
-    float part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
-    float part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
-    float part17 = part3 * 0.25f; // 4.0f * d21 - d23
-    float part18 = part6 * 0.25f; // d22 - d24
-    float part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
-    float part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
-    float part21 = part9 * 0.25f;                                        // 2.0f * (d21 - d23)
-    float part22 = part10 * 0.25f;                                       // - 4.0f * d21 + 5.0f * d23 - d25
-    float part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
+    DATA_TYPE part12 = 8.0f * d10 - 10.0f * d12 + 2.0f * d14 - 8.0f * d30 + 10.0f * d32 - 2.0f * d34;
+    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20 + 5.0f * d22 - d24
+    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d22 - d24
+    DATA_TYPE part15 = 8.0f * d11 - 2.0f * d13 - 8.0f * d31 + 2.0f * d33;
+    DATA_TYPE part16 = 8.0f * d12 - 2.0f * d14 - 8.0f * d32 + 2.0f * d34;
+    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d21 - d23
+    DATA_TYPE part18 = part6 * 0.25f; // d22 - d24
+    DATA_TYPE part19 = 4.0f * d11 - 4.0f * d13 - 4.0f * d31 + 4.0f * d33;
+    DATA_TYPE part20 = 2.0f * d12 - 2.0f * d14 - 2.0f * d32 + 2.0f * d34;
+    DATA_TYPE part21 = part9 * 0.25f;                                        // 2.0f * (d21 - d23)
+    DATA_TYPE part22 = part10 * 0.25f;                                       // - 4.0f * d21 + 5.0f * d23 - d25
+    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d31 - 7.5f * d33 + 1.5f * d35; // - 8.0f * d11 + 10.0f * d13 - 2.0f * d15 + 8.0f * d31 - 10.0f * d33 + 2.0f * d35;
 
     out6 += part0 - part1;
     out12 += part0 + part1;
@@ -867,54 +1201,54 @@
     out23 += part22 + part23;
     out29 += part22 - part23;
 
-    *((__global float *)dst_addr) = out6;
+    *((__global DATA_TYPE *)dst_addr) = out6;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out7;
+    *((__global DATA_TYPE *)dst_addr) = out7;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out8;
+    *((__global DATA_TYPE *)dst_addr) = out8;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out9;
+    *((__global DATA_TYPE *)dst_addr) = out9;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out10;
+    *((__global DATA_TYPE *)dst_addr) = out10;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out11;
+    *((__global DATA_TYPE *)dst_addr) = out11;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out12;
+    *((__global DATA_TYPE *)dst_addr) = out12;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out13;
+    *((__global DATA_TYPE *)dst_addr) = out13;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out14;
+    *((__global DATA_TYPE *)dst_addr) = out14;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out15;
+    *((__global DATA_TYPE *)dst_addr) = out15;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out16;
+    *((__global DATA_TYPE *)dst_addr) = out16;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out17;
+    *((__global DATA_TYPE *)dst_addr) = out17;
     dst_addr += dst_plane_stride;
 
-    *((__global float *)dst_addr) = out18;
+    *((__global DATA_TYPE *)dst_addr) = out18;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out19;
+    *((__global DATA_TYPE *)dst_addr) = out19;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out20;
+    *((__global DATA_TYPE *)dst_addr) = out20;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out21;
+    *((__global DATA_TYPE *)dst_addr) = out21;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out22;
+    *((__global DATA_TYPE *)dst_addr) = out22;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out23;
+    *((__global DATA_TYPE *)dst_addr) = out23;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out24;
+    *((__global DATA_TYPE *)dst_addr) = out24;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out25;
+    *((__global DATA_TYPE *)dst_addr) = out25;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out26;
+    *((__global DATA_TYPE *)dst_addr) = out26;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out27;
+    *((__global DATA_TYPE *)dst_addr) = out27;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out28;
+    *((__global DATA_TYPE *)dst_addr) = out28;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out29;
+    *((__global DATA_TYPE *)dst_addr) = out29;
     dst_addr += dst_plane_stride;
 
     // Row5
@@ -926,12 +1260,12 @@
     z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
     z_coord  = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    float d50 = *(__global float *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d51 = *(__global float *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    float d52 = *(__global float *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    float d53 = *(__global float *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    float d54 = *(__global float *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    float d55 = *(__global float *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d50 = *(__global DATA_TYPE *)(src_addr + valid_y0.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d51 = *(__global DATA_TYPE *)(src_addr + valid_y0.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d52 = *(__global DATA_TYPE *)(src_addr + valid_y0.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d53 = *(__global DATA_TYPE *)(src_addr + valid_y0.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d54 = *(__global DATA_TYPE *)(src_addr + valid_y1.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    DATA_TYPE d55 = *(__global DATA_TYPE *)(src_addr + valid_y1.s1 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Channels [30, 35]
     out0 = 16.0f * d10 - 20.0f * d12 - 20.0f * d30 + 25.0f * d32 + 4.0f * d50 - 5.0f * d52 + d54 + 4.0f * d14 - 5.0f * d34;
@@ -941,17 +1275,17 @@
     out4 = 8.0f * d11 - 4.0f * d12 - 8.0f * d13 - 10.0f * d31 + 5.0f * d32 + 10.0f * d33 + 2.0f * d51 - 2.0f * d53 - d52 + d54 + 4.0f * d14 - 5.0f * d34;
     out5 = 16.0f * d11 - 20.0f * d13 + 4.0f * d15 - 20.0f * d31 + 25.0f * d33 - 5.0f * d35 + 4.0f * d51 - 5.0f * d53 + d55;
 
-    *((__global float *)dst_addr) = out0;
+    *((__global DATA_TYPE *)dst_addr) = out0;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out1;
+    *((__global DATA_TYPE *)dst_addr) = out1;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out2;
+    *((__global DATA_TYPE *)dst_addr) = out2;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out3;
+    *((__global DATA_TYPE *)dst_addr) = out3;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out4;
+    *((__global DATA_TYPE *)dst_addr) = out4;
     dst_addr += dst_plane_stride;
-    *((__global float *)dst_addr) = out5;
+    *((__global DATA_TYPE *)dst_addr) = out5;
     dst_addr += dst_plane_stride;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
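When NUM_TILES_Y is defined, this kernel folds the batch index into the third work-item dimension, so the launcher is expected to enqueue NUM_TILES_Y * batches items on axis 2 and the kernel splits them back out with a modulo/division pair. A small sketch of that decomposition with hypothetical sizes:

    // Hypothetical sizes; mirrors "z = gid2 % NUM_TILES_Y; b = gid2 / NUM_TILES_Y" in the kernel.
    const size_t num_tiles_y = 14;
    const size_t batches     = 8;
    const size_t gws_z       = num_tiles_y * batches; // global work size on axis 2
    // For gid2 = 30: z = 30 % 14 = 2 (tile row), b = 30 / 14 = 2 (batch index)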
@@ -966,8 +1300,9 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -983,17 +1318,30 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(NUM_TILES_Y)
+    const int z = get_global_id(2) % NUM_TILES_Y;
+    const int b = get_global_id(2) / NUM_TILES_Y;
+#else  /* defined(NUM_TILES_Y) */
+    const int z = get_global_id(2);
+#endif /* defined(NUM_TILES_Y) */
 
     // Compute input address
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(float);
+#if defined(NUM_TILES_Y)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + b * src_stride_w;
+#else  /* defined(NUM_TILES_Y) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE);
+#endif /* defined(NUM_TILES_Y) */
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
     // Clamp coordinates. This clamp is valid for all rows
@@ -1005,21 +1353,25 @@
     int z_coord = z * OUTPUT_TILE_H;
 
     // Load the input tile
-    float8 in_row0;
-    in_row0.s0 = *(__global float *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s1 = *(__global float *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s2 = *(__global float *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s3 = *(__global float *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s4 = *(__global float *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s5 = *(__global float *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s6 = *(__global float *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s7 = *(__global float *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + y_coord.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + y_coord.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + y_coord.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + y_coord.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + y_coord.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + y_coord.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + y_coord.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + y_coord.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Calculate common factors for intermediate tensor
-    float8 comm_fact0 = 0.0f;
-    float8 tmp0       = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
 
-    float8 out0 = (float8)0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
 
     OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
 
@@ -1035,25 +1387,30 @@
     z_coord      = clamp(z_coord, (int8)0, (int8)SRC_DIM_2 - 1);                 // Clamp z coordinate
 
     // Load the input tile
-    float8 in_row0;
-    in_row0.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z);
-    in_row0.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z);
-    in_row0.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z);
-    in_row0.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z);
-    in_row0.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z);
-    in_row0.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z);
-    in_row0.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z);
-    in_row0.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0;
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord.s0 * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord.s1 * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord.s2 * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord.s3 * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord.s4 * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord.s5 * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord.s6 * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord.s7 * src_stride_z);
 
     // Calculate common factors for intermediate tensor
-    float8 comm_fact0 = 0.0f;
-    float8 tmp0       = in_row0;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = 0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    tmp0 = in_row0;
 
-    float8 out0 = (float8)0.0f;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = (VEC_DATA_TYPE(DATA_TYPE, 8))0.0f;
 
     OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
 #else                                            // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    float8 in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    in_row0, in_row1, in_row2, in_row3, in_row4, in_row5, in_row6, in_row7;
 
     // Clamp coordinates. This clamp is valid for all rows
     int8 y_coord = (int8)(y * OUTPUT_TILE_W) + (int8)(0, 1, 2, 3, 4, 5, 6, 7) - (int8)PAD_LEFT;
@@ -1066,14 +1423,14 @@
     z_coord      = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);                             // Clamp z coordinate
 
     // Load the input tile
-    in_row0.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row0.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row0.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row1
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 1;
@@ -1081,14 +1438,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row1.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row1.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row1.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row2
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 2;
@@ -1096,14 +1453,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row2.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row2.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row2.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row3
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 3;
@@ -1111,14 +1468,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row3.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row3.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row3.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row4
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 4;
@@ -1126,14 +1483,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row4.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row4.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row4.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row5
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 5;
@@ -1141,14 +1498,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row5.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row5.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row5.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row6
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 6;
@@ -1156,14 +1513,14 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row6.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row6.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row6.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
     // Row7
     z_coord = (z * (int)OUTPUT_TILE_H) - (int)PAD_TOP + 7;
@@ -1171,39 +1528,43 @@
     valid_y = select(valid_y, (int8)SRC_DIM_1, (int8)z_coord >= (int)SRC_DIM_2);
     z_coord = clamp(z_coord, 0, (int)SRC_DIM_2 - 1);
 
-    in_row7.s0 = *(__global float *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s1 = *(__global float *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s2 = *(__global float *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s3 = *(__global float *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s4 = *(__global float *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s5 = *(__global float *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s6 = *(__global float *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
-    in_row7.s7 = *(__global float *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s0 = *(__global DATA_TYPE *)(src_addr + valid_y.s0 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s1 = *(__global DATA_TYPE *)(src_addr + valid_y.s1 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s2 = *(__global DATA_TYPE *)(src_addr + valid_y.s2 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s3 = *(__global DATA_TYPE *)(src_addr + valid_y.s3 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s4 = *(__global DATA_TYPE *)(src_addr + valid_y.s4 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s5 = *(__global DATA_TYPE *)(src_addr + valid_y.s5 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s6 = *(__global DATA_TYPE *)(src_addr + valid_y.s6 * (int)src_stride_y + z_coord * src_stride_z);
+    in_row7.s7 = *(__global DATA_TYPE *)(src_addr + valid_y.s7 * (int)src_stride_y + z_coord * src_stride_z);
 
-    float8 comm_fact0 = in_row2 + in_row6 - 4.25f * in_row4;
-    float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
-    float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
 
     // Calculate intermediate tensor and reuse common factor vectors
-    const float8 tmp0 = in_row0 - in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
-    const float8 tmp1 = comm_fact0 + comm_fact1;
-    const float8 tmp2 = comm_fact0 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp0 = in_row0 - in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
 
-    comm_fact0 = 2.5f * in_row3;
-    comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
+    comm_fact0 = (DATA_TYPE)2.5f * in_row3;
+    comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.f * in_row5;
 
-    const float8 tmp3 = comm_fact1 + comm_fact2;
-    const float8 tmp4 = comm_fact2 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
 
-    comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
-    comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
+    comm_fact1 = (DATA_TYPE)2.f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
+    comm_fact2 = (DATA_TYPE)4.f * in_row2 - (DATA_TYPE)5.f * in_row4 + in_row6;
 
-    const float8 tmp5 = comm_fact1 + comm_fact2;
-    const float8 tmp6 = comm_fact2 - comm_fact1;
-    const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
+    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
 
     // Calculate output rows (reuse comm_fact0 vector)
-    float8 out0, out1, out2, out3, out4, out5, out6, out7;
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0, out1, out2, out3, out4, out5, out6, out7;
     OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
@@ -1212,260 +1573,85 @@
     OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
     OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 
     // Store values across the channels
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(float) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#if defined(NUM_TILES_Y)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
+#else  /* NUM_TILES_Y */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + (y + z * (int)NUM_TILES_X) * dst_stride_y;
+#endif /* NUM_TILES_Y */
 
-    *((__global float *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
-    *((__global float *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
-    *((__global float *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
-    *((__global float *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
-    *((__global float *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
-    *((__global float *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
-    *((__global float *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
-    *((__global float *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
+    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
+    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
+    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
+    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
 
 #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
-    *((__global float *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
-    *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
-    *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
-    *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
-    *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
-    *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
-    *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
-    *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
-    *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
-    *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
-    *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
-    *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
-    *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
-    *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
-    *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
-    *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
-    *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
-    *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
-    *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
-    *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
-    *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
-    *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
-    *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
-    *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
-    *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
-    *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
-    *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
-    *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
-    *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
-    *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
-    *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
-    *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
-    *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
-    *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
-    *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
-    *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
-    *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
-    *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
-    *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
-    *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
-    *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
-    *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
-    *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
-    *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
-    *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
-    *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
-    *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
-    *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
-    *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
-    *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
-    *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
-    *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
-    *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
-    *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
-    *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
+    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
+    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
+    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
+    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
+    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
+    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
+    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
+    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
+    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
+    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
+    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
+    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
+    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
+    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
+    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
+    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
+    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
+    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
+    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
+    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
+    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
+    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
+    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
+    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
+    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
+    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
+    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
+    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
+    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
+    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
+    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
+    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
+    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
+    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
+    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
+    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
+    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
+    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
+    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
+    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
+    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
+    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
+    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
+    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
+    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
+    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
+    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
+    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
+    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
+    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
+    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
+    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
+    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
+    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
+    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
+    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
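The rewrite above replaces the hard-coded float/float8 arithmetic with DATA_TYPE and VEC_DATA_TYPE(DATA_TYPE, 8) so that the same kernel source builds for both F32 and F16, and each scalar constant gains a (DATA_TYPE) cast so the scalar-times-vector expressions stay at the kernel's precision when DATA_TYPE=half (some OpenCL compilers reject or warn on a float scalar mixed with a half vector). A minimal sketch of the pattern follows; the VEC_DATA_TYPE definition is an assumption modelled on the usual token-pasting helper rather than a copy of the library's helpers.h, and the kernel is illustrative only, assuming -DDATA_TYPE is passed at build time just as the real kernels require.

/* Assumed definition: token pasting, so VEC_DATA_TYPE(half, 8) expands to half8. */
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)

/* Illustrative kernel: one common-factor expression from the transform, written so
 * it compiles unchanged with -DDATA_TYPE=float or -DDATA_TYPE=half (in the latter
 * case the cl_khr_fp16 extension must be enabled elsewhere in the build). */
__kernel void winograd_comm_fact_sketch(__global DATA_TYPE *src, __global DATA_TYPE *dst)
{
    VEC_DATA_TYPE(DATA_TYPE, 8)
    in_row2 = vload8(0, src);
    VEC_DATA_TYPE(DATA_TYPE, 8)
    in_row4 = vload8(1, src);
    VEC_DATA_TYPE(DATA_TYPE, 8)
    in_row6 = vload8(2, src);

    VEC_DATA_TYPE(DATA_TYPE, 8)
    comm_fact0 = in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;

    vstore8(comm_fact0, 0, dst);
}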
 
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-
-    // Compute input address
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(float) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(float)) - ((int)PAD_TOP * src_stride_y);
-
-    // Load input tile
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    const float8 in_row0 = vload8(0, (__global float *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    const float8 in_row0 = (float8)(*((__global float *)(src_addr + 0 * src_stride_y)),
-                                    *((__global float *)(src_addr + 1 * src_stride_y)),
-                                    *((__global float *)(src_addr + 2 * src_stride_y)),
-                                    *((__global float *)(src_addr + 3 * src_stride_y)),
-                                    *((__global float *)(src_addr + 4 * src_stride_y)),
-                                    *((__global float *)(src_addr + 5 * src_stride_y)),
-                                    *((__global float *)(src_addr + 6 * src_stride_y)),
-                                    *((__global float *)(src_addr + 7 * src_stride_y)));
-#else                                            // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    const float8 in_row0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y));
-    const float8 in_row1 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y));
-    const float8 in_row2 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y));
-    const float8 in_row3 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y));
-    const float8 in_row4 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y));
-    const float8 in_row5 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y));
-    const float8 in_row6 = vload8(0, (__global float *)(src_addr + 6 * src_stride_y));
-    const float8 in_row7 = vload8(0, (__global float *)(src_addr + 7 * src_stride_y));
-#endif                                           // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Calculate common factors for intermediate tensor
-    float8 tmp0       = in_row0;
-    float8 comm_fact0 = 0.0f;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    comm_fact0 += in_row2 + in_row6 - 4.25f * in_row4;
-    tmp0 += -in_row6 + 5.25f * in_row4 - 5.25f * in_row2;
-
-    float8 comm_fact1 = in_row1 + in_row5 - 4.25f * in_row3;
-    float8 comm_fact2 = 0.25f * in_row2 - 1.25f * in_row4 + in_row6;
-
-    const float8 tmp1 = comm_fact0 + comm_fact1;
-    const float8 tmp2 = comm_fact0 - comm_fact1;
-
-    comm_fact0 = 2.5f * in_row3;
-    comm_fact1 = 0.5f * in_row1 - comm_fact0 + 2.f * in_row5;
-
-    const float8 tmp3 = comm_fact1 + comm_fact2;
-    const float8 tmp4 = comm_fact2 - comm_fact1;
-
-    comm_fact1 = 2.f * in_row1 - comm_fact0 + 0.5f * in_row5;
-    comm_fact2 = 4.f * in_row2 - 5.f * in_row4 + in_row6;
-
-    const float8 tmp5 = comm_fact1 + comm_fact2;
-    const float8 tmp6 = comm_fact2 - comm_fact1;
-    const float8 tmp7 = in_row7 - in_row1 + 5.25f * in_row3 - 5.25f * in_row5;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Calculate output rows (reuse comm_fact0 vector)
-    float8 out0;
-
-    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    float8 out1, out2, out3, out4, out5, out6, out7;
-
-    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Store values across the channels
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(float) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-
-    *((__global float *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
-    *((__global float *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
-    *((__global float *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
-    *((__global float *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
-    *((__global float *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
-    *((__global float *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
-    *((__global float *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
-    *((__global float *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 8 * dst_stride_z))  = out1.s0;
-    *((__global float *)(dst_addr + 9 * dst_stride_z))  = out1.s1;
-    *((__global float *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
-    *((__global float *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
-    *((__global float *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
-    *((__global float *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
-    *((__global float *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
-    *((__global float *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
-    *((__global float *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
-    *((__global float *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
-    *((__global float *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
-    *((__global float *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
-    *((__global float *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
-    *((__global float *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
-    *((__global float *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
-    *((__global float *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
-    *((__global float *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
-    *((__global float *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
-    *((__global float *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
-    *((__global float *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
-    *((__global float *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
-    *((__global float *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
-    *((__global float *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
-    *((__global float *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
-    *((__global float *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
-    *((__global float *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
-    *((__global float *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
-    *((__global float *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
-    *((__global float *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
-    *((__global float *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
-    *((__global float *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
-    *((__global float *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
-    *((__global float *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
-    *((__global float *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
-    *((__global float *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
-    *((__global float *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
-    *((__global float *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
-    *((__global float *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
-    *((__global float *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
-    *((__global float *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
-    *((__global float *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
-    *((__global float *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
-    *((__global float *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
-    *((__global float *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
-    *((__global float *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
-    *((__global float *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
-    *((__global float *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
-    *((__global float *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
-    *((__global float *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
-    *((__global float *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
-    *((__global float *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
-    *((__global float *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
-    *((__global float *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
-    *((__global float *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
-    *((__global float *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
-    *((__global float *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
  *
@@ -1474,8 +1660,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1491,10 +1678,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1511,7 +1702,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
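The wrapper kernels in this region all change in the same way: they take two extra uint parameters, src_stride_w and dst_stride_w, and forward them to the generic transforms, which use them as per-batch byte offsets (the b * src_stride_w and b * dst_stride_w terms introduced above). A minimal sketch of that addressing, using a hypothetical helper name and byte strides as in the kernels above:

/* Hypothetical helper (not part of the library): byte address of element
 * (x, y, z, b) of a tensor whose strides are expressed in bytes. The batch
 * index b is scaled by the W stride that the wrappers now pass through. */
inline __global uchar *tensor4d_addr_sketch(__global uchar *base,
                                            int x, int y, int z, int b,
                                            uint stride_x, uint stride_y,
                                            uint stride_z, uint stride_w)
{
    return base + x * (int)stride_x + y * (int)stride_y + z * (int)stride_z + b * (int)stride_w;
}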
 
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2
@@ -1521,8 +1714,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1538,10 +1732,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
                                                  src_stride_x,
@@ -1558,7 +1756,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
@@ -1568,8 +1768,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1585,10 +1786,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1605,7 +1810,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
@@ -1615,8 +1822,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1632,10 +1840,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1652,7 +1864,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 #if defined(SRC_DIM_1) && defined(SRC_DIM_2)
@@ -1665,8 +1879,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1682,10 +1897,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
                                                  src_stride_x,
@@ -1702,7 +1921,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
@@ -1714,8 +1935,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1731,10 +1953,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
                                                  src_stride_x,
@@ -1751,9 +1977,11 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
-#endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
+#endif // defined(NUM_TILES_Y) && defined(SRC_DIM_1) && defined(SRC_DIM_2)
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
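Collecting the @note requirements documented above, the set of -D options a host would pass when compiling these kernels looks roughly like the sketch below. The values are placeholders and the variable name is hypothetical; for the NHWC variants, -DSRC_DIM_1/-DSRC_DIM_2 are required by the #if guards, and -DNUM_TILES_Y selects the batched destination addressing.

/* Illustrative only: options for an FP16, horizontal 4x1/3x1 NHWC input transform.
 * Every numeric value is a placeholder, not taken from the library. */
const char *winograd_input_transform_build_opts =
    "-DDATA_TYPE=half "
    "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=1 "
    "-DPAD_LEFT=1 -DPAD_TOP=0 "
    "-DNUM_TILES_X=14 -DNUM_TILES_Y=14 "
    "-DSRC_DIM_1=56 -DSRC_DIM_2=56 "
    "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL";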
 
 #if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
@@ -1764,8 +1992,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1781,10 +2010,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1801,7 +2034,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2
@@ -1811,8 +2046,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1828,10 +2064,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr,
                                                  src_stride_x,
@@ -1848,7 +2088,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
@@ -1858,8 +2100,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1875,10 +2118,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1895,7 +2142,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
@@ -1905,8 +2154,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1922,10 +2172,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr,
                                                  src_stride_x,
@@ -1942,7 +2196,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
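
The two trailing uint arguments added to these input-transform kernels carry the batch (W) strides of the source and destination tensors, presumably so that a single enqueue can cover a whole batch (the output-transform kernels in the companion file below compute batch = get_global_id(2) / SRC_DEPTH). A minimal host-side sketch of binding the extra arguments follows; it is illustrative only, and the argument indices rest on the assumption that each TENSOR3D_DECLARATION expands to eight kernel arguments (pointer, three stride/step pairs and the first-element offset), i.e. src occupies indices 0-7 and dst indices 8-15.

#include <CL/cl.h>

/* Hedged sketch, not the library's setup code: append the W strides (in bytes)
 * after the assumed src (args 0-7) and dst (args 8-15) tensor arguments. */
static cl_int set_winograd_w_strides(cl_kernel kernel,
                                     cl_uint   src_stride_w_bytes,
                                     cl_uint   dst_stride_w_bytes)
{
    cl_int err = clSetKernelArg(kernel, 16, sizeof(cl_uint), &src_stride_w_bytes);
    if(err == CL_SUCCESS)
    {
        err = clSetKernelArg(kernel, 17, sizeof(cl_uint), &dst_stride_w_bytes);
    }
    return err;
}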
 
 #if defined(SRC_DIM_1) && defined(SRC_DIM_2)
@@ -1955,8 +2211,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -1972,10 +2229,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
                                                  src_stride_x,
@@ -1992,7 +2253,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 
 /** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
@@ -2004,8 +2267,9 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
  * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -2021,10 +2285,14 @@
  * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  */
 __kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
     TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
 {
     winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
                                                  src_stride_x,
@@ -2041,7 +2309,9 @@
                                                  dst_step_y,
                                                  dst_stride_z,
                                                  dst_step_z,
-                                                 dst_offset_first_element_in_bytes);
+                                                 dst_offset_first_element_in_bytes,
+                                                 src_stride_w,
+                                                 dst_stride_w);
 }
 #endif // defined(SRC_DIM_1) && defined(SRC_DIM_2)
 #endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index a1e7b3e..f52b027 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -31,27 +31,32 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_2x2_3x3_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -59,15 +64,19 @@
 )
 {
     // Each thread stores a 2x2/2x1 or 1x2 tile accordingly with the filter size
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
-    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Compute the 2x1 or 1x2 output tile
@@ -77,20 +86,20 @@
     float out00 = d00 + d01 + d02;
     float out01 = d01 - d02 - d03;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    float d10 = *((__global float *)(src_addr + 4 * src_stride_z));
-    float d11 = *((__global float *)(src_addr + 5 * src_stride_z));
-    float d12 = *((__global float *)(src_addr + 6 * src_stride_z));
-    float d13 = *((__global float *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
 
-    float d20 = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d21 = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d22 = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d23 = *((__global float *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
 
-    float d30 = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d31 = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d32 = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d33 = *((__global float *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
 
     // Compute the 2x2 output tile
     float k0 = d01 + d11 + d21;
@@ -118,36 +127,43 @@
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
 
     out00 += (float)b;
     out01 += (float)b;
 #endif // defined(HAS_BIAS)
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
-    *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore2((float2)(out00, out01), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(out00, out01), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 #if defined(HAS_BIAS)
     // Add bias
-    out10 += (float)b;
-    out11 += (float)b;
+    out10 += (DATA_TYPE)b;
+    out11 += (DATA_TYPE)b;
 #endif // defined(HAS_BIAS)
 
-    vstore2((float2)(out10, out11), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))((DATA_TYPE)out10, (DATA_TYPE)out11), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
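
For readers checking the arithmetic above: d00..d03 (and d10..d33 in the 2D branch) are the transform-domain values read across src_stride_z, and out00/out01 (plus out10/out11 in the 2D branch) are the Winograd F(2,3) reconstruction. A sketch of the 1D case in standard Winograd notation, matching the expressions computed above:

    A^T = \begin{pmatrix} 1 & 1 & 1 & 0 \\ 0 & 1 & -1 & -1 \end{pmatrix},
    \qquad
    A^T \begin{pmatrix} d_{00} \\ d_{01} \\ d_{02} \\ d_{03} \end{pmatrix}
      = \begin{pmatrix} d_{00} + d_{01} + d_{02} \\ d_{01} - d_{02} - d_{03} \end{pmatrix}

The 2D branch applies the same matrix along both dimensions of the 4x4 tile d00..d33, which is where the intermediate column sums (k0, ...) come from.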
 
@@ -158,27 +174,32 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x4_3x3_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -186,17 +207,21 @@
 )
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the 6x6 or 6x1 tile
-    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
-    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
-    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Compute out00, out01, out02 and out03
@@ -205,46 +230,46 @@
     float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
     float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
-    float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
-    float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
 
-    float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
-    float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
-    float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
 
-    float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
-    float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
-    float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
-    float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
-    float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
-    float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
 
-    float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
-    float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
-    float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
-    float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
-    float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
-    float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
 
-    float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
-    float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
-    float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
-    float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
-    float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
-    float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
 
     // Compute out00, out01, out02 and out03
-    float out00 = d01 + d21 + d41 + d11 + d31;
-    float out01 = d01 + d21 + d41 + d11 + d31;
-    float out02 = d01 + d21 + d41 + d11 + d31;
-    float out03 = d01 + d21 + d41 + d11 + d31;
+    float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
+    float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
 
     float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
     float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
@@ -301,12 +326,15 @@
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
 
     out00 += (float)b;
     out01 += (float)b;
@@ -315,16 +343,20 @@
 #endif // defined(HAS_BIAS)
 
     // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
-    *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
-    *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
-    *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = (DATA_TYPE)out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = (DATA_TYPE)out01;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = (DATA_TYPE)out02;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = (DATA_TYPE)out03;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out00, (DATA_TYPE)out01, (DATA_TYPE)out02, (DATA_TYPE)out03), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -345,9 +377,9 @@
     out32 += (float)b;
     out33 += (float)b;
 #endif // defined(HAS_BIAS)
-    vstore4((float4)(out10, out11, out12, out13), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
-    vstore4((float4)(out20, out21, out22, out23), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
-    vstore4((float4)(out30, out31, out32, out33), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out10, (DATA_TYPE)out11, (DATA_TYPE)out12, (DATA_TYPE)out13), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out20, (DATA_TYPE)out21, (DATA_TYPE)out22, (DATA_TYPE)out23), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out30, (DATA_TYPE)out31, (DATA_TYPE)out32, (DATA_TYPE)out33), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
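
A short worked example of the tile and batch indexing used by the NCHW output kernels above, with purely illustrative numbers (the values of NUM_TILES_X, SRC_DEPTH and the global IDs are assumptions for the example, not taken from the library):

    y_in  = get_global_id(1) = 11, NUM_TILES_X = 8, OUTPUT_TILE_W = OUTPUT_TILE_H = 4
    x_out = (11 % 8) * 4 = 12,  y_out = (11 / 8) * 4 = 4,  z_out = get_global_id(0)
    with SRC_DEPTH = 36 and get_global_id(2) = 80:  batch = 80 / 36 = 2
    store offset = x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y
                 + z_out * dst_stride_z + batch * dst_stride_w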
 
@@ -358,45 +390,54 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  * @param[in]  dst_size                          Size of the destination tensor, minus the last padding
  */
 __kernel void winograd_output_transform_4x4_3x3_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
     int dst_size)
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
-    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
-    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
-    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Compute out00, out01, out02 and out03
@@ -406,40 +447,40 @@
     float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
-    float d10 = *((__global float *)(src_addr + 6 * src_stride_z));
-    float d11 = *((__global float *)(src_addr + 7 * src_stride_z));
-    float d12 = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d13 = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d14 = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d15 = *((__global float *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
 
-    float d20 = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d21 = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d22 = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d23 = *((__global float *)(src_addr + 15 * src_stride_z));
-    float d24 = *((__global float *)(src_addr + 16 * src_stride_z));
-    float d25 = *((__global float *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
 
-    float d30 = *((__global float *)(src_addr + 18 * src_stride_z));
-    float d31 = *((__global float *)(src_addr + 19 * src_stride_z));
-    float d32 = *((__global float *)(src_addr + 20 * src_stride_z));
-    float d33 = *((__global float *)(src_addr + 21 * src_stride_z));
-    float d34 = *((__global float *)(src_addr + 22 * src_stride_z));
-    float d35 = *((__global float *)(src_addr + 23 * src_stride_z));
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
 
-    float d40 = *((__global float *)(src_addr + 24 * src_stride_z));
-    float d41 = *((__global float *)(src_addr + 25 * src_stride_z));
-    float d42 = *((__global float *)(src_addr + 26 * src_stride_z));
-    float d43 = *((__global float *)(src_addr + 27 * src_stride_z));
-    float d44 = *((__global float *)(src_addr + 28 * src_stride_z));
-    float d45 = *((__global float *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
 
-    float d50 = *((__global float *)(src_addr + 30 * src_stride_z));
-    float d51 = *((__global float *)(src_addr + 31 * src_stride_z));
-    float d52 = *((__global float *)(src_addr + 32 * src_stride_z));
-    float d53 = *((__global float *)(src_addr + 33 * src_stride_z));
-    float d54 = *((__global float *)(src_addr + 34 * src_stride_z));
-    float d55 = *((__global float *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
 
     // Compute out00, out01, out02 and out03
     float out00 = d01 + d21 + d41 + d11 + d31;
@@ -502,77 +543,88 @@
     int x_out = get_global_id(0);
     int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
 #if defined(HAS_BIAS)
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+    DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
 
-    out00 += (float)b;
-    out01 += (float)b;
-    out02 += (float)b;
-    out03 += (float)b;
+    out00 += (DATA_TYPE)b;
+    out01 += (DATA_TYPE)b;
+    out02 += (DATA_TYPE)b;
+    out03 += (DATA_TYPE)b;
 #if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    out10 += (float)b;
-    out11 += (float)b;
-    out12 += (float)b;
-    out13 += (float)b;
+    out10 += (DATA_TYPE)b;
+    out11 += (DATA_TYPE)b;
+    out12 += (DATA_TYPE)b;
+    out13 += (DATA_TYPE)b;
 
-    out20 += (float)b;
-    out21 += (float)b;
-    out22 += (float)b;
-    out23 += (float)b;
+    out20 += (DATA_TYPE)b;
+    out21 += (DATA_TYPE)b;
+    out22 += (DATA_TYPE)b;
+    out23 += (DATA_TYPE)b;
 
-    out30 += (float)b;
-    out31 += (float)b;
-    out32 += (float)b;
-    out33 += (float)b;
+    out30 += (DATA_TYPE)b;
+    out31 += (DATA_TYPE)b;
+    out32 += (DATA_TYPE)b;
+    out33 += (DATA_TYPE)b;
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #endif // defined(HAS_BIAS)
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
-    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
     // Store the 1x4 output tile
-    *((__global float *)(dst_ptr + offset.s0)) = out00;
-    *((__global float *)(dst_ptr + offset.s1)) = out01;
-    *((__global float *)(dst_ptr + offset.s2)) = out02;
-    *((__global float *)(dst_ptr + offset.s3)) = out03;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = (DATA_TYPE)out00;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = (DATA_TYPE)out01;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = (DATA_TYPE)out02;
+    *((__global DATA_TYPE *)(dst_ptr + offset.s3)) = (DATA_TYPE)out03;
 #elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Store the 4x1 output tile
-    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
     int mult_y = min(dst_size - offset, 1);
 
-    *((__global float *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;
-    *((__global float *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;
-    *((__global float *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;
-    *((__global float *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = (DATA_TYPE)out00;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = (DATA_TYPE)out01;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = (DATA_TYPE)out02;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = (DATA_TYPE)out03;
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
     // Get output address
-    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
     offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
-    int4 mult_y = min((int4)dst_size - offset, (int4)1);                                       // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
+    int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
     // Store the 4x4 output tile
-    *((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;
-    *((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;
-    *((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;
-    *((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;
-    *((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;
-    *((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;
-    *((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;
-    *((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;
-    *((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;
-    *((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;
-    *((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;
-    *((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;
-    *((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;
-    *((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;
-    *((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;
-    *((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = (DATA_TYPE)out00;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = (DATA_TYPE)out01;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = (DATA_TYPE)out02;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = (DATA_TYPE)out03;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = (DATA_TYPE)out10;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = (DATA_TYPE)out11;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = (DATA_TYPE)out12;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = (DATA_TYPE)out13;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = (DATA_TYPE)out20;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = (DATA_TYPE)out21;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = (DATA_TYPE)out22;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = (DATA_TYPE)out23;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = (DATA_TYPE)out30;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = (DATA_TYPE)out31;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = (DATA_TYPE)out32;
+    *((__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = (DATA_TYPE)out33;
 
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
 }
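
One clarification of the out-of-bounds handling in the NHWC kernel above; this only restates what the code already does:

    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size)
        // rows that would land past the last plane are redirected to dst_size,
        // which (per the comment in the code) points at the last padding element
    mult_y = min((int4)dst_size - offset, (int4)1)
        // 1 for in-bounds rows, 0 for clamped rows, so the mult_y * n * dst_stride_y
        // term cannot push a clamped row's store past that padding element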
@@ -601,27 +653,32 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x4_5x5_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -629,27 +686,38 @@
 )
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     // Compute output address
     int y_in  = get_global_id(1);
     int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
     int z_out = get_global_id(0);
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+#if defined(SRC_DEPTH)
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
+#else  /* defined(SRC_DEPTH) */
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the input tile
-    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
-    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
-    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
-    float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
-    float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Compute out00, out01, out02 and out03
@@ -662,91 +730,93 @@
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
 
-    out00 += (float)b;
-    out01 += (float)b;
-    out02 += (float)b;
-    out03 += (float)b;
+    out00 += (DATA_TYPE)b;
+    out01 += (DATA_TYPE)b;
+    out02 += (DATA_TYPE)b;
+    out03 += (DATA_TYPE)b;
 #endif // defined(HAS_BIAS)
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    *((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;
-    *((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;
-    *((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;
-    *((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;
+    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out00;
+    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out01;
+    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out02;
+    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out03;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(out00, out01, out02, out03), 0, (__global DATA_TYPE *)(dst_addr));
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    float d10                                                           = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d11                                                           = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d12                                                           = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d13                                                           = *((__global float *)(src_addr + 11 * src_stride_z));
-    float d14                                                           = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d15                                                           = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d16                                                           = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d17                                                           = *((__global float *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
 
-    float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
-    float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
-    float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
-    float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
-    float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
-    float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
-    float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
-    float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
 
-    float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
-    float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
-    float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
-    float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
-    float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
-    float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
-    float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
-    float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
 
-    float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
-    float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
-    float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
-    float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
-    float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
-    float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
-    float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
-    float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
 
-    float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
-    float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
-    float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
-    float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
-    float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
-    float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
-    float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
-    float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
 
-    float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
-    float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
-    float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
-    float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
-    float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
-    float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
-    float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
-    float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
 
-    float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
-    float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
-    float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
-    float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
-    float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
-    float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
-    float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
-    float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
 
     // Compute the 8x4 intermediate tensor
-    float4 comm_fact0, comm_fact1, comm_fact2;
-    float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+    VEC_DATA_TYPE(float, 4)
+    comm_fact0, comm_fact1, comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
 
     COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
     COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
@@ -762,33 +832,37 @@
     comm_fact1 = tmp_col3 + tmp_col4;
     comm_fact2 = tmp_col5 + tmp_col6;
 
-    float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
-    float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0;
+    VEC_DATA_TYPE(float, 4)
+    out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2;
 
     comm_fact0 = tmp_col1 - tmp_col2;
     comm_fact1 = tmp_col3 - tmp_col4;
     comm_fact2 = tmp_col5 - tmp_col6;
 
-    float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
-    float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+    VEC_DATA_TYPE(float, 4)
+    out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7;
 
 #if defined(HAS_BIAS)
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
 
-    out_col0 += (float4)b;
-    out_col1 += (float4)b;
-    out_col2 += (float4)b;
-    out_col3 += (float4)b;
+    out_col0 += (VEC_DATA_TYPE(float, 4))b;
+    out_col1 += (VEC_DATA_TYPE(float, 4))b;
+    out_col2 += (VEC_DATA_TYPE(float, 4))b;
+    out_col3 += (VEC_DATA_TYPE(float, 4))b;
 #endif // defined(HAS_BIAS)
 
     // Store the output tile
-    vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));
-    vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));
-    vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));
-    vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s0, (DATA_TYPE)out_col1.s0, (DATA_TYPE)out_col2.s0, (DATA_TYPE)out_col3.s0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s1, (DATA_TYPE)out_col1.s1, (DATA_TYPE)out_col2.s1, (DATA_TYPE)out_col3.s1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s2, (DATA_TYPE)out_col1.s2, (DATA_TYPE)out_col2.s2, (DATA_TYPE)out_col3.s2), 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y));
+    vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))((DATA_TYPE)out_col0.s3, (DATA_TYPE)out_col1.s3, (DATA_TYPE)out_col2.s3, (DATA_TYPE)out_col3.s3), 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
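/* Illustration (not part of the patch): the COMPUTE_TMP_COL macro invoked above is
 * defined earlier in winograd.cl and does not appear in this hunk. Assuming it is the
 * column-wise analogue of the row combination visible in this kernel (sums of symmetric
 * taps with weights 1/1/8 and 1/4/2 for the even outputs, differences with weights
 * 1/2/4 and 1/8/1 for the odd outputs), a minimal self-contained sketch for one column
 * of the 8x8 tile could look like the helper below; the function name and the pointer
 * parameter are hypothetical and used only for this illustration.
 */
void compute_tmp_col_example(DATA_TYPE d0, DATA_TYPE d1, DATA_TYPE d2, DATA_TYPE d3,
                             DATA_TYPE d4, DATA_TYPE d5, DATA_TYPE d6, DATA_TYPE d7,
                             VEC_DATA_TYPE(float, 4) * col)
{
    VEC_DATA_TYPE(float, 4) cf; // common factors shared between output rows
    cf.s0     = d1 + d2;        // sums of symmetric taps feed the even outputs
    cf.s1     = d3 + d4;
    cf.s2     = d5 + d6;
    (*col).s0 = cf.s0 + cf.s1 + 8.f * cf.s2 + d0;
    (*col).s2 = cf.s0 + 4.f * cf.s1 + 2.f * cf.s2;
    cf.s0     = d1 - d2;        // differences feed the odd outputs
    cf.s1     = d3 - d4;
    cf.s2     = d5 - d6;
    (*col).s1 = cf.s0 + 2.f * cf.s1 + 4.f * cf.s2;
    (*col).s3 = cf.s0 + 8.f * cf.s1 + cf.s2 + d7;
}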
 
@@ -799,51 +873,63 @@
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
  * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
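/* Illustration (not part of the patch): a plausible set of compile-time options for the
 * full 4x4/5x5 case of this kernel, assuming FP16 data. The concrete values below are
 * hypothetical; the host side of the library assembles them from the tensor shapes.
 *
 *   -DDATA_TYPE=half -DNUM_TILES_X=14 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DSRC_DEPTH=16
 *
 * -DHAS_BIAS is added only when a bias vector is passed, and
 * -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL / -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL only for
 * the 5x1 / 1x5 variants.
 */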
 __kernel void winograd_output_transform_4x4_5x5_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
     int dst_size)
 {
     // Each thread stores a 4x4/4x1 or 1x4 tile
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
+#if defined(SRC_DEPTH)
+    Tensor4D       src             = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else  /* defined(SRC_DEPTH) */
+    Tensor3D       src             = CONVERT_TO_TENSOR3D_STRUCT(src);
     const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
 
     int y_in  = get_global_id(1);
     int x_out = get_global_id(0);
     int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
     int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
+#if defined(SRC_DEPTH)
+    int batch = get_global_id(2) / SRC_DEPTH;
+#endif /* defined(SRC_DEPTH) */
 
     // Load the values across the channels to compose the input tile
-    float d00 = *((__global float *)(src_addr + 0 * src_stride_z));
-    float d01 = *((__global float *)(src_addr + 1 * src_stride_z));
-    float d02 = *((__global float *)(src_addr + 2 * src_stride_z));
-    float d03 = *((__global float *)(src_addr + 3 * src_stride_z));
-    float d04 = *((__global float *)(src_addr + 4 * src_stride_z));
-    float d05 = *((__global float *)(src_addr + 5 * src_stride_z));
-    float d06 = *((__global float *)(src_addr + 6 * src_stride_z));
-    float d07 = *((__global float *)(src_addr + 7 * src_stride_z));
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+    DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Compute out00, out01, out02 and out03
@@ -856,7 +942,7 @@
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+    float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
 
     out00 += (float)b;
     out01 += (float)b;
@@ -867,91 +953,97 @@
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Get output address
-    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
-    offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else                                                                               /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif                                                                              /* defined(SRC_DEPTH) */
+    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
-    *(__global float *)(dst_ptr + offset.s0) = out00;
-    *(__global float *)(dst_ptr + offset.s1) = out01;
-    *(__global float *)(dst_ptr + offset.s2) = out02;
-    *(__global float *)(dst_ptr + offset.s3) = out03;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s0) = (DATA_TYPE)out00;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s1) = (DATA_TYPE)out01;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s2) = (DATA_TYPE)out02;
+    *(__global DATA_TYPE *)(dst_ptr + offset.s3) = (DATA_TYPE)out03;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     // Get output address
-    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;
+    int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
 
-    *(__global float *)(dst_ptr + 0 * dst_stride_y + offset) = out00;
-    *(__global float *)(dst_ptr + 1 * dst_stride_y + offset) = out01;
-    *(__global float *)(dst_ptr + 2 * dst_stride_y + offset) = out02;
-    *(__global float *)(dst_ptr + 3 * dst_stride_y + offset) = out03;
+    *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = (DATA_TYPE)out00;
+    *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = (DATA_TYPE)out01;
+    *(__global DATA_TYPE *)(dst_ptr + 2 * dst_stride_y + offset) = (DATA_TYPE)out02;
+    *(__global DATA_TYPE *)(dst_ptr + 3 * dst_stride_y + offset) = (DATA_TYPE)out03;
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
 #else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 
-    float d10 = *((__global float *)(src_addr + 8 * src_stride_z));
-    float d11 = *((__global float *)(src_addr + 9 * src_stride_z));
-    float d12 = *((__global float *)(src_addr + 10 * src_stride_z));
-    float d13 = *((__global float *)(src_addr + 11 * src_stride_z));
-    float d14 = *((__global float *)(src_addr + 12 * src_stride_z));
-    float d15 = *((__global float *)(src_addr + 13 * src_stride_z));
-    float d16 = *((__global float *)(src_addr + 14 * src_stride_z));
-    float d17 = *((__global float *)(src_addr + 15 * src_stride_z));
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+    DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
 
-    float d20 = *((__global float *)(src_addr + 16 * src_stride_z));
-    float d21 = *((__global float *)(src_addr + 17 * src_stride_z));
-    float d22 = *((__global float *)(src_addr + 18 * src_stride_z));
-    float d23 = *((__global float *)(src_addr + 19 * src_stride_z));
-    float d24 = *((__global float *)(src_addr + 20 * src_stride_z));
-    float d25 = *((__global float *)(src_addr + 21 * src_stride_z));
-    float d26 = *((__global float *)(src_addr + 22 * src_stride_z));
-    float d27 = *((__global float *)(src_addr + 23 * src_stride_z));
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
+    DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));
+    DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
+    DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
+    DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
 
-    float d30 = *((__global float *)(src_addr + 24 * src_stride_z));
-    float d31 = *((__global float *)(src_addr + 25 * src_stride_z));
-    float d32 = *((__global float *)(src_addr + 26 * src_stride_z));
-    float d33 = *((__global float *)(src_addr + 27 * src_stride_z));
-    float d34 = *((__global float *)(src_addr + 28 * src_stride_z));
-    float d35 = *((__global float *)(src_addr + 29 * src_stride_z));
-    float d36 = *((__global float *)(src_addr + 30 * src_stride_z));
-    float d37 = *((__global float *)(src_addr + 31 * src_stride_z));
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
+    DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
+    DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
+    DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
+    DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
 
-    float d40 = *((__global float *)(src_addr + 32 * src_stride_z));
-    float d41 = *((__global float *)(src_addr + 33 * src_stride_z));
-    float d42 = *((__global float *)(src_addr + 34 * src_stride_z));
-    float d43 = *((__global float *)(src_addr + 35 * src_stride_z));
-    float d44 = *((__global float *)(src_addr + 36 * src_stride_z));
-    float d45 = *((__global float *)(src_addr + 37 * src_stride_z));
-    float d46 = *((__global float *)(src_addr + 38 * src_stride_z));
-    float d47 = *((__global float *)(src_addr + 39 * src_stride_z));
+    DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
+    DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
+    DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
+    DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
+    DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z));
+    DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z));
+    DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z));
+    DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z));
 
-    float d50 = *((__global float *)(src_addr + 40 * src_stride_z));
-    float d51 = *((__global float *)(src_addr + 41 * src_stride_z));
-    float d52 = *((__global float *)(src_addr + 42 * src_stride_z));
-    float d53 = *((__global float *)(src_addr + 43 * src_stride_z));
-    float d54 = *((__global float *)(src_addr + 44 * src_stride_z));
-    float d55 = *((__global float *)(src_addr + 45 * src_stride_z));
-    float d56 = *((__global float *)(src_addr + 46 * src_stride_z));
-    float d57 = *((__global float *)(src_addr + 47 * src_stride_z));
+    DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z));
+    DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z));
+    DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z));
+    DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z));
+    DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z));
+    DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z));
+    DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z));
+    DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z));
 
-    float d60 = *((__global float *)(src_addr + 48 * src_stride_z));
-    float d61 = *((__global float *)(src_addr + 49 * src_stride_z));
-    float d62 = *((__global float *)(src_addr + 50 * src_stride_z));
-    float d63 = *((__global float *)(src_addr + 51 * src_stride_z));
-    float d64 = *((__global float *)(src_addr + 52 * src_stride_z));
-    float d65 = *((__global float *)(src_addr + 53 * src_stride_z));
-    float d66 = *((__global float *)(src_addr + 54 * src_stride_z));
-    float d67 = *((__global float *)(src_addr + 55 * src_stride_z));
+    DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z));
+    DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z));
+    DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z));
+    DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z));
+    DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z));
+    DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z));
+    DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z));
+    DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z));
 
-    float d70 = *((__global float *)(src_addr + 56 * src_stride_z));
-    float d71 = *((__global float *)(src_addr + 57 * src_stride_z));
-    float d72 = *((__global float *)(src_addr + 58 * src_stride_z));
-    float d73 = *((__global float *)(src_addr + 59 * src_stride_z));
-    float d74 = *((__global float *)(src_addr + 60 * src_stride_z));
-    float d75 = *((__global float *)(src_addr + 61 * src_stride_z));
-    float d76 = *((__global float *)(src_addr + 62 * src_stride_z));
-    float d77 = *((__global float *)(src_addr + 63 * src_stride_z));
+    DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z));
+    DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z));
+    DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z));
+    DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z));
+    DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z));
+    DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z));
+    DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z));
+    DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z));
 
     // Compute the 8x4 intermediate tensor
-    float4 comm_fact0, comm_fact1, comm_fact2;
-    float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
+    VEC_DATA_TYPE(float, 4)
+    comm_fact0, comm_fact1, comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;
 
     COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);
     COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);
@@ -967,49 +1059,57 @@
     comm_fact1 = tmp_col3 + tmp_col4;
     comm_fact2 = tmp_col5 + tmp_col6;
 
-    float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
-    float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;
+    VEC_DATA_TYPE(float, 4)
+    out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;
 
     comm_fact0 = tmp_col1 - tmp_col2;
     comm_fact1 = tmp_col3 - tmp_col4;
     comm_fact2 = tmp_col5 - tmp_col6;
 
-    float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
-    float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
+    VEC_DATA_TYPE(float, 4)
+    out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;
+    VEC_DATA_TYPE(float, 4)
+    out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;
 
 #if defined(HAS_BIAS)
     // Add bias
     Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
 
-    float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));
+    DATA_TYPE b = (DATA_TYPE) * ((__global DATA_TYPE *)(vector_offset(&bias, x_out)));
 
-    out_col0 += (float4)b;
-    out_col1 += (float4)b;
-    out_col2 += (float4)b;
-    out_col3 += (float4)b;
+    out_col0 += (VEC_DATA_TYPE(float, 4))b;
+    out_col1 += (VEC_DATA_TYPE(float, 4))b;
+    out_col2 += (VEC_DATA_TYPE(float, 4))b;
+    out_col3 += (VEC_DATA_TYPE(float, 4))b;
 #endif // defined(HAS_BIAS)
     // Get output address
-    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);
+#if defined(SRC_DEPTH)
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w);
+#else  /* defined(SRC_DEPTH) */
+    int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z);
+#endif /* defined(SRC_DEPTH) */
     offset      = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
     int4 mult_y = min((int4)dst_size - offset, (int4)1);                                 // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.
 
     // Store the output tile
-    *(__global float *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = out_col0.s0;
-    *(__global float *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = out_col1.s0;
-    *(__global float *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = out_col2.s0;
-    *(__global float *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = out_col3.s0;
-    *(__global float *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = out_col0.s1;
-    *(__global float *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = out_col1.s1;
-    *(__global float *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = out_col2.s1;
-    *(__global float *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = out_col3.s1;
-    *(__global float *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = out_col0.s2;
-    *(__global float *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = out_col1.s2;
-    *(__global float *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = out_col2.s2;
-    *(__global float *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = out_col3.s2;
-    *(__global float *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = out_col0.s3;
-    *(__global float *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = out_col1.s3;
-    *(__global float *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = out_col2.s3;
-    *(__global float *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = out_col3.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col0.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col1.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 2 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col2.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 3 * (int)dst_stride_y + offset.s0) = (DATA_TYPE)out_col3.s0;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 0 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col0.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 1 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col1.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 2 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col2.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s1 * 3 * (int)dst_stride_y + offset.s1) = (DATA_TYPE)out_col3.s1;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 0 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col0.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 1 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col1.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 2 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col2.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s2 * 3 * (int)dst_stride_y + offset.s2) = (DATA_TYPE)out_col3.s2;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 0 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col0.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 1 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col1.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 2 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col2.s3;
+    *(__global DATA_TYPE *)(dst_ptr + mult_y.s3 * 3 * (int)dst_stride_y + offset.s3) = (DATA_TYPE)out_col3.s3;
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
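/* Illustration (not part of the patch): the out-of-bounds handling used above, in a
 * self-contained form. dst_size points at the last (padding) element, so clamping the
 * per-lane offsets to it and zeroing the y-stride multiplier makes every out-of-range
 * lane write to that single padding location instead of past the buffer. The function
 * name and parameter names below are hypothetical and only for illustration.
 */
int4 clamp_store_offsets_example(int4 offset, int dst_stride_z, int dst_size, int4 *mult_y)
{
    // Spread the four z planes, then clamp any offset that runs past the last plane.
    offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size);
    // Clamped lanes get a y multiplier of 0, so all four rows of that lane collapse onto
    // the same padding element; in-range lanes keep a multiplier of 1.
    *mult_y = min((int4)dst_size - offset, (int4)1);
    return offset;
}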
 
@@ -1020,27 +1120,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_2x1_3x1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1054,6 +1159,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1062,6 +1169,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1079,27 +1188,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x1_3x1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1113,6 +1227,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1121,6 +1237,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1138,27 +1256,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x1_5x1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1172,6 +1295,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1180,6 +1305,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1197,27 +1324,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x1_3x1_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
@@ -1230,6 +1362,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1238,6 +1372,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes,
 #if defined(HAS_BIAS)
                                            bias_ptr,
@@ -1254,27 +1390,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_4x1_5x1_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
@@ -1287,6 +1428,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1295,6 +1438,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes,
 #if defined(HAS_BIAS)
                                            bias_ptr,
@@ -1313,27 +1458,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_1x2_1x3_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1347,6 +1497,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1355,6 +1507,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1372,27 +1526,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_1x4_1x3_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1406,6 +1565,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1414,6 +1575,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1431,27 +1594,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_1x4_1x5_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst)
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
 #if defined(HAS_BIAS)
     ,
     VECTOR_DECLARATION(bias)
@@ -1465,6 +1633,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1473,6 +1643,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes
 #if defined(HAS_BIAS)
                                            ,
@@ -1490,27 +1662,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_1x4_1x3_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
@@ -1523,6 +1700,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1531,6 +1710,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes,
 #if defined(HAS_BIAS)
                                            bias_ptr,
@@ -1547,27 +1728,32 @@
  * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
  * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
  * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
  * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void winograd_output_transform_1x4_1x5_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
 #if defined(HAS_BIAS)
     VECTOR_DECLARATION(bias),
 #endif // defined(HAS_BIAS)
@@ -1580,6 +1766,8 @@
                                            src_step_y,
                                            src_stride_z,
                                            src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
                                            src_offset_first_element_in_bytes,
                                            dst_ptr,
                                            dst_stride_x,
@@ -1588,6 +1776,8 @@
                                            dst_step_y,
                                            dst_stride_z,
                                            dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
                                            dst_offset_first_element_in_bytes,
 #if defined(HAS_BIAS)
                                            bias_ptr,
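
The hunks above promote the winograd output transform kernels from TENSOR3D_DECLARATION to TENSOR4D_DECLARATION, which is why every call site now also forwards stride_w/step_w for a fourth dimension (typically the batch). As a minimal stand-alone sketch, assuming the usual byte-stride addressing convention used throughout these kernels (the function and variable names below are illustrative, not part of the patch), the offset of one element in such a 4-D tensor is:

    // Illustrative only: byte offset of element (x, y, z, w) given per-dimension
    // byte strides; the w * stride_w term is what the added W arguments enable.
    #include <cstddef>

    std::size_t element_offset_bytes(std::size_t offset_first_element_in_bytes,
                                     std::size_t x, std::size_t stride_x,
                                     std::size_t y, std::size_t stride_y,
                                     std::size_t z, std::size_t stride_z,
                                     std::size_t w, std::size_t stride_w)
    {
        return offset_first_element_in_bytes
               + x * stride_x + y * stride_y + z * stride_z + w * stride_w;
    }
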
diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl
new file mode 100644
index 0000000..2240d7c
--- /dev/null
+++ b/src/core/CL/cl_kernels/yolo_layer.cl
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACT) && defined(NUM_CLASSES) && defined(VEC_SIZE)
+
+#if VEC_SIZE != 1
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SELECT_TYPE VEC_DATA_TYPE(SELECT_DATA_TYPE, VEC_SIZE)
+
+#include "activation_helpers.h"
+
+/** This performs a YOLO partial activation function for NCHW data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nchw(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    const int  box_ch_id = get_global_id(2) % (NUM_CLASSES + 5);
+    const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+    if(activate)
+    {
+        // Load data
+        TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+        data      = ACTIVATION_OP(ACT, data); // select(1.0f, ACTIVATION_OP(ACT, data), (SELECT_TYPE)activate);
+
+        // Store result
+        VSTORE(VEC_SIZE)
+        (data, 0, (__global DATA_TYPE *)output.ptr);
+    }
+#ifndef IN_PLACE
+    else
+    {
+        // Load data
+        TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+        // Store result
+        VSTORE(VEC_SIZE)
+        (data, 0, (__global DATA_TYPE *)output.ptr);
+    }
+#endif // IN_PLACE
+}
+
+#else // VEC_SIZE != 1
+
+#define TYPE DATA_TYPE
+#define SELECT_TYPE SELECT_DATA_TYPE
+
+#include "activation_helpers.h"
+
+/** This performs a YOLO partial activation function for NHWC data layout
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=1
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note The number of classes should be given as a preprocessor argument using -DNUM_CLASSES=num. e.g. -DNUM_CLASSES=80
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void yolo_layer_nhwc(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    const int  box_ch_id = get_global_id(0) % (NUM_CLASSES + 5);
+    const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+    if(activate)
+    {
+        // Load data
+        DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+        data           = select(data, ACTIVATION_OP(ACT, data), (SELECT_TYPE)activate);
+
+        // Store result
+        *((__global DATA_TYPE *)output.ptr) = data;
+    }
+#ifndef IN_PLACE
+    else
+    {
+        // Load data
+        DATA_TYPE data = *((__global DATA_TYPE *)input.ptr);
+
+        // Store result
+        *((__global DATA_TYPE *)output.ptr) = data;
+    }
+#endif // IN_PLACE
+}
+
+#endif // VEC_SIZE != 1
+#endif // defined(DATA_TYPE) && defined(SELECT_DATA_TYPE) && defined(ACT) && defined(NUM_CLASSES) && defined(VEC_SIZE)
\ No newline at end of file
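
Both yolo_layer kernels above share one gating rule: the tensor is read as groups of (NUM_CLASSES + 5) channels per box, and channels 2 and 3 of each group (presumably the raw box width/height terms, which YOLO leaves unactivated) are copied through while every other channel receives the activation. A minimal host-side sketch of that rule, with an illustrative NUM_CLASSES value:

    // Sketch only: mirrors the box_ch_id test used by yolo_layer_nchw/nhwc.
    #include <iostream>

    bool yolo_channel_is_activated(int channel, int num_classes)
    {
        const int box_ch_id = channel % (num_classes + 5);
        return box_ch_id != 2 && box_ch_id != 3;
    }

    int main()
    {
        const int num_classes = 80; // illustrative, e.g. a COCO-style model
        for(int ch = 0; ch < 8; ++ch)
        {
            std::cout << "channel " << ch << ": "
                      << (yolo_channel_is_activated(ch, num_classes) ? "activated" : "passthrough")
                      << "\n";
        }
        return 0;
    }
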
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index a15e99b..73a4d7d 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -133,6 +133,7 @@
     std::set<std::string> build_opts;
     build_opts.emplace(("-DACT=" + lower_string(string_from_activation_func(act_info.activation()))));
     build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.emplace(("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt)));
     build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
     if(is_data_type_quantized(dt))
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 2372d45..10d7fd4 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -31,7 +31,7 @@
 
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
+constexpr unsigned int num_elems_processed_per_iteration = 8;
 
 Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
 {
@@ -140,6 +140,7 @@
     build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
     build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
     build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
     {
         build_opts.emplace("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
@@ -155,11 +156,22 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
     ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += (policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+    _config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
 }
 
 Status CLArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
@@ -176,8 +188,9 @@
     const TensorShape &in_shape2 = _input2->info()->tensor_shape();
     const TensorShape &out_shape = _output->info()->tensor_shape();
 
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    bool       can_collapse = true;
+    const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
         for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
@@ -204,7 +217,7 @@
         add_3D_tensor_argument(idx, _input2, slice_input2);
         add_3D_tensor_argument(idx, _output, slice);
 
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, lws_hint());
 
         collapsed.slide_window_slice_3D(slice_input1);
         collapsed.slide_window_slice_3D(slice_input2);
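
The new is_vector test above appears intended to stop the dimension-count check from vetoing the Z-collapse when one of the addition inputs is a plain 1-D vector; the broadcast slice windows prepared just before the loop then handle that input. A rough stand-alone sketch of the decision, with simplified shape handling (TensorShape semantics are only approximated, so treat this as an assumption rather than the library's code):

    // Sketch only: approximates the can_collapse / is_vector logic above.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    bool can_collapse_z(const std::vector<std::size_t> &shape1, const std::vector<std::size_t> &shape2)
    {
        auto total = [](const std::vector<std::size_t> &s) {
            std::size_t t = 1;
            for(std::size_t d : s) { t *= d; }
            return t;
        };
        auto dim = [](const std::vector<std::size_t> &s, std::size_t d) { return d < s.size() ? s[d] : std::size_t(1); };

        const bool is_vector    = shape1.size() == 1 || shape2.size() == 1;
        bool       can_collapse = true;
        if(std::min(total(shape1), total(shape2)) > 1 && !is_vector)
        {
            can_collapse = std::min(shape1.size(), shape2.size()) > 2; // 2 == Window::DimZ
            const std::size_t out_dims = std::max(shape1.size(), shape2.size());
            for(std::size_t d = 2; can_collapse && d < out_dims; ++d)
            {
                can_collapse = dim(shape1, d) == dim(shape2, d);
            }
        }
        return can_collapse;
    }
    // e.g. {16, 8, 4, 2} vs {16, 8, 4, 2} still collapses, and {16, 8, 4, 2} vs {16}
    // (a broadcast vector) is no longer rejected by the dimension-count comparison.
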
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index 299ac55..95d2011 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -36,45 +36,82 @@
 #include <set>
 #include <string>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+    const bool is_qasymm = is_data_type_quantized_asymmetric(input1.data_type()) || is_data_type_quantized_asymmetric(input2.data_type());
+    if(is_qasymm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2);
+    }
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured output
-    if((output != nullptr) && (output->total_size() != 0))
+    if(output.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
                                         "Output can only be U8 if both inputs are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+        if(is_qasymm)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &output);
+        }
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(output, out_shape);
 
-    bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+        {
+            set_format_if_unknown(output, Format::S16);
+        }
+        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+        {
+            set_format_if_unknown(output, Format::F16);
+        }
+        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+        {
+            set_format_if_unknown(output, Format::F32);
+        }
+    }
 
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
     output_access.set_valid_region(win, valid_region);
 
@@ -91,22 +128,11 @@
 void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-    }
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
     _input2 = input2;
@@ -114,26 +140,39 @@
 
     bool has_float_out = is_data_type_float(output->info()->data_type());
 
+    // Setup kernel
+    std::string kernel_name = "arithmetic_sub";
+
     // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace((policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
-    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    CLBuildOptions build_opts;
+    build_opts.add_option_if_else(policy == ConvertPolicy::WRAP || has_float_out, "-DWRAP", "-DSATURATE");
+    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    if(is_data_type_quantized_asymmetric(input1->info()->data_type()))
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+        kernel_name += "_quantized";
+    }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 }
 
 Status CLArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -143,16 +182,51 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-    Window slice     = collapsed.first_slice_window_3D();
+    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
+    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
+    const TensorShape &out_shape = _output->info()->tensor_shape();
+
+    // Collapse only if broadcast dimensions is less than 2, or in case of no broadcasting
+    bool can_collapse = true;
+    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input1, slice);
-        add_3D_tensor_argument(idx, _input2, slice);
+
+        add_3D_tensor_argument(idx, _input1, slice_input1);
+        add_3D_tensor_argument(idx, _input2, slice_input2);
         add_3D_tensor_argument(idx, _output, slice);
+
         enqueue(queue, *this, slice);
+
+        collapsed.slide_window_slice_3D(slice_input1);
+        collapsed.slide_window_slice_3D(slice_input2);
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+
+BorderSize CLArithmeticSubtractionKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}
+} // namespace arm_compute
\ No newline at end of file
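
CLArithmeticSubtractionKernel now reports a right-hand border whose size depends on how much of the X dimension is produced by broadcasting. A worked sketch of the computation in border_size() above, using the kernel's num_elems_processed_per_iteration of 16 and illustrative dimensions:

    // Sketch only: mirrors CLArithmeticSubtractionKernel::border_size().
    #include <algorithm>
    #include <iostream>

    unsigned int subtraction_right_border(unsigned int out_dim0, unsigned int in1_dim0, unsigned int in2_dim0)
    {
        constexpr unsigned int num_elems_processed_per_iteration = 16;
        const unsigned int     replicate_size                    = out_dim0 - std::min(in1_dim0, in2_dim0);
        return std::min(num_elems_processed_per_iteration - 1U, replicate_size);
    }

    int main()
    {
        // Broadcasting a length-1 input across an output of width 24:
        std::cout << subtraction_right_border(24, 24, 1) << "\n"; // prints 15
        // No broadcasting: replicate_size is 0, so no extra border is needed.
        std::cout << subtraction_right_border(24, 24, 24) << "\n"; // prints 0
        return 0;
    }
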
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index d4a7207..07bcb75 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -159,6 +159,7 @@
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option_if(act_info.enabled(), "-DFUSED_ACTIVATION=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
@@ -192,8 +193,6 @@
     ICLKernel::configure_internal(win_config.second);
 
     _config_id = "batch_normalization_layer_";
-    _config_id += string_from_data_layout(input->info()->data_layout());
-    _config_id += "_";
     _config_id += string_from_data_type(input->info()->data_type());
     _config_id += "_";
     _config_id += support::cpp11::to_string(input->info()->dimension(0));
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
new file mode 100644
index 0000000..58a8d10
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_y <= 0);
+
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const int idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+        const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape_x * input->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape_y * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] != input->tensor_shape()[idx_channel]);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
+    : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+{
+}
+
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
+
+    _input       = input;
+    _block_shape = block_shape;
+    _output      = output;
+
+    const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    ICLKernel::configure_internal(win);
+}
+
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    TensorShape output_shape = compute_batch_to_space_shape(input->info(), block_shape_x, block_shape_y);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info()));
+
+    _input  = input;
+    _output = output;
+
+    const int idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(input->info()->dimension(3)));
+    build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
+    build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    ICLKernel::configure_internal(win);
+}
+
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
+    return Status{};
+}
+
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output));
+    return Status{};
+}
+
+void CLBatchToSpaceLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_4D();
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_out.set(3, Window::Dimension(0, 0, 0));
+
+    int batch_id = 0;
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_argument(idx, batch_id);
+        if(_block_shape != nullptr)
+        {
+            add_1D_tensor_argument(idx, _block_shape, vector_slice);
+        }
+        add_4D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_in);
+
+        ++batch_id;
+    }
+    while(window.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
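For reference, the BATCH_SIZE and BLOCK_SHAPE_* build options above parameterise the usual batch-to-space remapping: the input batch dimension is unfolded into per-block spatial offsets. The host-side sketch below restates that conventional mapping (no cropping) on flat buffers; linear_index and the buffer layout are assumptions made for this example only, not arm_compute APIs, and the actual OpenCL kernel may order its loops differently.

    #include <cstddef>
    #include <vector>

    // Flat row-major index helper (assumed layout for this example).
    static std::size_t linear_index(std::size_t n, std::size_t h, std::size_t w, std::size_t c,
                                    std::size_t H, std::size_t W, std::size_t C)
    {
        return ((n * H + h) * W + w) * C + c;
    }

    std::vector<float> batch_to_space(const std::vector<float> &in,
                                      std::size_t N_in, std::size_t H, std::size_t W, std::size_t C,
                                      std::size_t block_x, std::size_t block_y)
    {
        const std::size_t N_out = N_in / (block_x * block_y);
        std::vector<float> out(N_out * (H * block_y) * (W * block_x) * C);
        for(std::size_t n = 0; n < N_in; ++n)
        {
            const std::size_t b  = n % N_out;             // destination batch
            const std::size_t dy = (n / N_out) / block_x; // block offset along y
            const std::size_t dx = (n / N_out) % block_x; // block offset along x
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                    for(std::size_t c = 0; c < C; ++c)
                        out[linear_index(b, h * block_y + dy, w * block_x + dx, c,
                                         H * block_y, W * block_x, C)] = in[linear_index(n, h, w, c, H, W, C)];
        }
        return out;
    }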
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
new file mode 100644
index 0000000..bff28e3
--- /dev/null
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(boxes, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(deltas, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[1] != boxes->tensor_shape()[1]);
+    ARM_COMPUTE_RETURN_ERROR_ON(deltas->tensor_shape()[0] % 4 != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(deltas->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
+
+    if(pred_boxes->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
+        ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
+    return Status{};
+}
+} // namespace
+
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
+    : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+{
+}
+
+void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
+    auto_init_if_empty(*pred_boxes->info(), *deltas->info());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
+
+    // Set instance variables
+    _boxes      = boxes;
+    _pred_boxes = pred_boxes;
+    _deltas     = deltas;
+
+    // Get image height and width (rescaled)
+    const int img_h = floor(info.img_height() / info.scale() + 0.5f);
+    const int img_w = floor(info.img_width() / info.scale() + 0.5f);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(deltas->info()->data_type()));
+    build_opts.add_option("-DWEIGHT_X=" + float_to_string_with_full_precision(info.weights()[0]));
+    build_opts.add_option("-DWEIGHT_Y=" + float_to_string_with_full_precision(info.weights()[1]));
+    build_opts.add_option("-DWEIGHT_W=" + float_to_string_with_full_precision(info.weights()[2]));
+    build_opts.add_option("-DWEIGHT_H=" + float_to_string_with_full_precision(info.weights()[3]));
+    build_opts.add_option("-DBBOX_XFORM_CLIP=" + float_to_string_with_full_precision(info.bbox_xform_clip()));
+    build_opts.add_option("-DIMG_WIDTH=" + support::cpp11::to_string(img_w));
+    build_opts.add_option("-DIMG_HEIGHT=" + support::cpp11::to_string(img_h));
+    build_opts.add_option("-DBOX_FIELDS=" + support::cpp11::to_string(4));
+    build_opts.add_option("-DSCALE_BEFORE=" + float_to_string_with_full_precision(info.scale()));
+    build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
+    build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bounding_box_transform", build_opts.options()));
+
+    // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
+    const unsigned int num_elems_processed_per_iteration = 4;
+    Window             win                               = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+}
+
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
+    return Status{};
+}
+
+void CLBoundingBoxTransformKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // Set arguments
+    unsigned int idx = 0;
+    add_1D_tensor_argument(idx, _boxes, slice);
+    add_2D_tensor_argument(idx, _pred_boxes, slice);
+    add_2D_tensor_argument(idx, _deltas, slice);
+
+    // Note that we don't need to loop over the slices, since all the tensors involved are 2D
+    enqueue(queue, *this, slice);
+}
+} // namespace arm_compute
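The WEIGHT_*, BBOX_XFORM_CLIP and IMG_WIDTH/IMG_HEIGHT options above parameterise the standard Faster R-CNN style box transform. The sketch below restates that math for a single box in plain C++ using the classic Detectron convention (width = x2 - x1 + 1, clamping to the image); whether the +1 and the extra OFFSET correction apply in the CL kernel depends on the BoundingBoxTransformInfo flags, so treat this as an approximation rather than the kernel's exact arithmetic.

    #include <algorithm>
    #include <array>
    #include <cmath>

    std::array<float, 4> transform_box(const std::array<float, 4> &box,    // {x1, y1, x2, y2}
                                       const std::array<float, 4> &delta,  // {dx, dy, dw, dh}
                                       const std::array<float, 4> &weights,
                                       float bbox_xform_clip, float img_w, float img_h)
    {
        const float w  = box[2] - box[0] + 1.f;
        const float h  = box[3] - box[1] + 1.f;
        const float cx = box[0] + 0.5f * w;
        const float cy = box[1] + 0.5f * h;

        // Deltas are de-weighted and the size deltas clipped before exponentiation.
        const float dx = delta[0] / weights[0];
        const float dy = delta[1] / weights[1];
        const float dw = std::min(delta[2] / weights[2], bbox_xform_clip);
        const float dh = std::min(delta[3] / weights[3], bbox_xform_clip);

        const float pred_cx = cx + dx * w;
        const float pred_cy = cy + dy * h;
        const float pred_w  = std::exp(dw) * w;
        const float pred_h  = std::exp(dh) * h;

        const auto clamp = [](float v, float lo, float hi) { return std::max(lo, std::min(v, hi)); };
        return { clamp(pred_cx - 0.5f * pred_w, 0.f, img_w - 1.f),
                 clamp(pred_cy - 0.5f * pred_h, 0.f, img_h - 1.f),
                 clamp(pred_cx + 0.5f * pred_w, 0.f, img_w - 1.f),
                 clamp(pred_cy + 0.5f * pred_h, 0.f, img_h - 1.f) };
    }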
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index be4d687..53a5456 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -67,18 +67,22 @@
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
+    const bool             is_nhwc                             = input->data_layout() == DataLayout::NHWC;
+    const unsigned int     num_elems_processed_per_iteration_x = is_nhwc ? 4 : max_cl_vector_width / input->element_size();
+    constexpr unsigned int num_elems_processed_per_iteration_y = 2;
 
     // Configure kernel window
-    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
-    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
-    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+    Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
 
     const bool window_changed = update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, input->valid_region());
 
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, win_collapsed);
 }
 } // namespace
 
@@ -96,14 +100,19 @@
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
 
-    const unsigned int channels   = input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const unsigned int block_size = max_cl_vector_width / input->info()->element_size();
+    const DataLayout   data_layout = input->info()->data_layout();
+    const bool         is_nhwc     = data_layout == DataLayout::NHWC;
+    const unsigned int channels    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const unsigned int vec_size    = is_nhwc ? 4 : max_cl_vector_width / input->info()->element_size();
 
     // Set kernel build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
     build_opts.add_option("-DK=" + support::cpp11::to_string(channels / num_groups));
-    build_opts.add_option("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(channels - vec_size), 0)));
+
     switch(input->info()->element_size())
     {
         case 1:
@@ -120,12 +129,33 @@
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("channel_shuffle_nchw", build_opts.options()));
+    std::string kernel_name = "channel_shuffle_" + lower_string(string_from_data_layout(data_layout));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(num_groups);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
 }
 
 Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
@@ -141,14 +171,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
-    }
-    while(window.slide_window_slice_3D(slice));
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, window);
+    add_4D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window, lws_hint());
 }
 } // namespace arm_compute
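As a reminder of what -DNUM_GROUPS and -DK encode: channel shuffle views the channel dimension as a num_groups x K matrix and transposes it. A minimal host-side sketch of that index permutation, for illustration only:

    #include <cstdio>

    // Input channel c = group * K + within_group maps to within_group * num_groups + group.
    int shuffled_channel(int c, int num_groups, int K)
    {
        const int group        = c / K;
        const int within_group = c % K;
        return within_group * num_groups + group;
    }

    int main()
    {
        const int num_groups = 2, K = 3; // 6 channels, shuffled as 0,2,4,1,3,5
        for(int c = 0; c < num_groups * K; ++c)
        {
            std::printf("in %d -> out %d\n", c, shuffled_channel(c, num_groups, K));
        }
        return 0;
    }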
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 40032f9..d748745 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -40,7 +40,7 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -49,7 +49,7 @@
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
@@ -58,11 +58,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, num_groups)).set_data_layout(DataLayout::NCHW));
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
 
     const unsigned int num_elems_read_per_iteration = 8;
 
@@ -87,7 +87,7 @@
 {
 }
 
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -105,8 +105,8 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
     build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
-    build_opts.add_option_if(num_groups > 1, "-DGROUPING");
+    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
 
@@ -130,7 +130,7 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
@@ -143,22 +143,26 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
+    bool is_collapsed     = false;
+    bool is_collapsed_out = false;
+
     Window out_window;
     out_window.use_tensor_dimensions(_output->info()->tensor_shape());
 
-    Window slice     = window.first_slice_window_3D();
-    Window slice_out = out_window.first_slice_window_3D();
+    Window collapsed     = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
+    Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
 
-    unsigned int idx = 2 * num_arguments_per_3D_tensor();
-    _kernel.setArg<cl_uint>(idx++, _output->info()->strides_in_bytes()[3]);
+    ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
 
+    Window slice     = collapsed.first_slice_window_3D();
+    Window slice_out = collapsed_out.first_slice_window_4D();
     do
     {
         // Set inputs
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice_out);
+        add_4D_tensor_argument(idx, _output, slice_out);
         enqueue(queue, *this, slice, lws_hint());
     }
-    while(window.slide_window_slice_3D(slice) && out_window.slide_window_slice_3D(slice_out));
+    while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
 }
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index e79019e..4f178c9 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -61,6 +61,7 @@
                     num_elems_processed_per_iteration = 16;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -75,6 +76,7 @@
                     num_elems_processed_per_iteration = 8;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -84,9 +86,11 @@
             switch(output->info()->format())
             {
                 case Format::RGBA8888:
+                case Format::U8:
                     num_elems_processed_per_iteration = 16;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -143,6 +147,7 @@
                     num_elems_processed_per_iteration = 4;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -220,6 +225,7 @@
                     num_elems_read_per_iteration_x    = 16;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -235,6 +241,7 @@
                     num_elems_read_per_iteration_x    = 8;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -303,6 +310,7 @@
                     num_elems_processed_per_iteration = 16;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
@@ -316,6 +324,7 @@
                     num_elems_processed_per_iteration = 16;
                     break;
                 default:
+                    ARM_COMPUTE_ERROR("Not supported");
                     break;
             }
             break;
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index 2da67d2..e14e5da 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -30,21 +30,22 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList())
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
 
     // Validate output if initialized
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -69,6 +70,64 @@
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
+
+std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
+{
+    TensorShape input_shape  = input->tensor_shape();
+    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input_shape, padding);
+
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+    // Configure window
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    // Pad on the x dimension accounting for the padding offset along the same dimension
+    AccessWindowHorizontal output_access(output, padding[0].first, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+/** Generate the string "-DPAD= @p dim @p index @p padding"
+ *
+ * @param[in] dim     The dimension index
+ * @param[in] index   Can be 0 for the start dimension and 1 for the end dimension
+ * @param[in] padding The value to pad for that index/dimension pair
+ *
+ * @return The correct concatenated string
+ */
+std::string generate_pad_string(const size_t dim, const size_t index, const size_t padding)
+{
+    return "-DPAD" + support::cpp11::to_string(dim) + support::cpp11::to_string(index) + "=" + support::cpp11::to_string(padding);
+}
+
+/** Pass the padding as build options to the kernel.
+ *
+ * @param[in]  padding    The list of paddings, one pair per dimension
+ * @param[out] build_opts The build options to which the padding options are added
+ */
+void add_padding_as_build_options(const PaddingList &padding, CLBuildOptions &build_opts)
+{
+    size_t dim = 0;
+    for(dim = 0; dim < padding.size(); dim++)
+    {
+        build_opts.add_option(generate_pad_string(dim, 0, padding[dim].first));
+        build_opts.add_option(generate_pad_string(dim, 1, padding[dim].second));
+    }
+
+    while(dim < TensorShape::num_max_dimensions)
+    {
+        build_opts.add_option(generate_pad_string(dim, 0, 0));
+        build_opts.add_option(generate_pad_string(dim, 1, 0));
+        dim++;
+    }
+}
+
 } // namespace
 
 CLCopyKernel::CLCopyKernel()
@@ -76,32 +135,68 @@
 {
 }
 
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding));
 
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    std::pair<Status, Window> win_config;
+
+    if(padding.empty())
+    {
+        // Build kernel
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_tensor", build_opts.options()));
+
+        // Configure window
+        win_config = validate_and_configure_window(input->info(), output->info());
+    }
+    else
+    {
+        // Add compile time options
+        add_padding_as_build_options(padding, build_opts);
+
+        // If we are padding in the fourth dimension the kernel needs to know the depth of the
+        // different cubes
+        if(padding.size() == 4)
+        {
+            const size_t depth = input->info()->tensor_shape()[2];
+            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
+        }
+
+        // Build kernel
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_pad_tensor", build_opts.options()));
+
+        // Configure window
+        win_config = validate_and_configure_window_with_padding(input->info(), output->info(), padding);
+    }
+
+    // Validate and set the window
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 }
 
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding));
+
+    if(padding.empty())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(input->clone().get(), output->clone().get(), padding).first);
+    }
 
     return Status{};
 }
@@ -123,3 +218,4 @@
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
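To make the padded-copy path concrete, the snippet below reproduces the -DPAD<dim><index>=<value> build options that add_padding_as_build_options() emits for a small padding list, using plain standard-library types in place of CLBuildOptions and PaddingList; the value 6 for the maximum number of dimensions is an assumption matching TensorShape::num_max_dimensions.

    #include <cstddef>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main()
    {
        using PaddingList = std::vector<std::pair<std::size_t, std::size_t>>; // {before, after} per dimension
        const PaddingList padding            = { { 1, 1 }, { 2, 0 } };        // pad dim0 by 1/1, dim1 by 2/0
        const std::size_t num_max_dimensions = 6;                             // assumed TensorShape::num_max_dimensions

        // Dimensions present in the list take their values; the rest are padded with 0/0.
        for(std::size_t dim = 0; dim < num_max_dimensions; ++dim)
        {
            const std::size_t start = dim < padding.size() ? padding[dim].first : 0;
            const std::size_t end   = dim < padding.size() ? padding[dim].second : 0;
            std::printf("-DPAD%zu0=%zu -DPAD%zu1=%zu\n", dim, start, dim, end);
        }
        return 0;
    }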
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index c6a0031..dd7d790 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -43,13 +43,21 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+
+    const DataLayout data_layout = input->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
+    for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
     }
@@ -93,28 +101,61 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const DataLayout data_layout = _input->info()->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
     const int out_start_x = _info.pad().first;
-    const int out_end_x   = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
+    const int out_end_x   = _output->info()->dimension(idx_w) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
     const int out_step_x  = _info.stride().first;
 
     const int out_start_y = _inner_border.top + _info.pad().second;
-    const int out_end_y   = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1;
+    const int out_end_y   = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
     const int out_step_y  = _info.stride().second;
 
-    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
-    Window slice_out = collapsed.first_slice_window_3D();
-    slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
-    slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
-    Window slice_in = collapsed.first_slice_window_3D();
-
-    do
+    switch(data_layout)
     {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
-        enqueue(queue, *this, slice_out);
+        case DataLayout::NCHW:
+        {
+            Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+            Window slice_out = collapsed.first_slice_window_3D();
+            slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+            slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+            Window slice_in = collapsed.first_slice_window_3D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, slice_in);
+                add_3D_tensor_argument(idx, _output, slice_out);
+                enqueue(queue, *this, slice_out);
+            }
+            while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            // NOTE: the window is not collapsed for NHWC, as the strided output ranges are set on DimY/DimZ below
+            Window slice_out = window.first_slice_window_3D();
+            slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
+            slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+            Window slice_in = window.first_slice_window_3D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, slice_in);
+                add_3D_tensor_argument(idx, _output, slice_out);
+                enqueue(queue, *this, slice_out);
+            }
+            while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data layout");
     }
-    while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
 }
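The strided output window set up in run() above (start at the pad, step by the stride) expresses the usual upsample-by-scatter stage of deconvolution: every input element lands at a strided offset in a zero-filled output. A conceptual host-side equivalent, assuming row-major 2D buffers, not the kernel itself:

    #include <cstddef>
    #include <vector>

    std::vector<float> upsample_2d(const std::vector<float> &in, std::size_t in_w, std::size_t in_h,
                                   std::size_t stride_x, std::size_t stride_y,
                                   std::size_t pad_x, std::size_t pad_y,
                                   std::size_t out_w, std::size_t out_h)
    {
        std::vector<float> out(out_w * out_h, 0.f); // gaps between scattered elements stay zero
        for(std::size_t y = 0; y < in_h; ++y)
        {
            for(std::size_t x = 0; x < in_w; ++x)
            {
                out[(y * stride_y + pad_y) * out_w + (x * stride_x + pad_x)] = in[y * in_w + x];
            }
        }
        return out;
    }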
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index a40aa28..eb561fa 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -207,8 +207,7 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3NCHWKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                         unsigned int        depth_multiplier,
-                                                         ActivationLayerInfo act_info)
+                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, act_info));
@@ -225,8 +224,17 @@
     _conv_pad_top  = conv_info.pad_top();
     _border_size   = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
 
+    // Configure kernel window
+    std::string     kernel_name;
+    const GPUTarget gpu_target = get_target();
+
+    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
     // Set build options
     CLBuildOptions build_opts;
+    build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->tensor_shape().z()));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
@@ -263,25 +271,16 @@
                 const float s2 = output->info()->quantization_info().scale;
                 const int   o2 = output->info()->quantization_info().offset;
 
+                build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+                build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
                 if(o1 != o2 || s1 != s2)
                 {
-                    build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
                     build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
-                    build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
                     build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
                 }
             }
         }
     }
-
-    // Configure kernel window
-    std::string     kernel_name;
-    const GPUTarget gpu_target = get_target();
-
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, gpu_target, kernel_name);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set config_id for enabling LWS tuning
@@ -316,15 +315,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Create input window and adjust
-    Window win_in = window;
-    win_in.adjust(Window::DimX, -_conv_pad_left, true);
-    win_in.adjust(Window::DimY, -_conv_pad_top, true);
-    win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
-    win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
 
-    Window slice_in      = win_in.first_slice_window_3D();
-    Window slice_out     = window.first_slice_window_3D();
+    // Create input window and adjust
+    Window collapsed_in = collapsed;
+    collapsed_in.adjust(Window::DimX, -_conv_pad_left, true);
+    collapsed_in.adjust(Window::DimY, -_conv_pad_top, true);
+    collapsed_in.set_dimension_step(Window::DimX, collapsed_in.x().step() * _conv_stride_x);
+    collapsed_in.set_dimension_step(Window::DimY, collapsed_in.y().step() * _conv_stride_y);
+
+    Window slice_in      = collapsed_in.first_slice_window_3D();
+    Window slice_out     = collapsed.first_slice_window_3D();
     Window slice_weights = window.first_slice_window_3D();
     slice_weights.set_dimension_step(Window::DimX, 0);
     slice_weights.set_dimension_step(Window::DimY, 0);
@@ -347,5 +348,5 @@
 
         enqueue(queue, *this, slice_out, lws_hint());
     }
-    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+    while(collapsed.slide_window_slice_3D(slice_out) && collapsed_in.slide_window_slice_3D(slice_in));
 }
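The input window adjustment above (shift by -pad, multiply the step by the stride) is the usual output-to-input anchor mapping of a strided, padded convolution; as a plain expression:

    // Top-left input coordinate read for a given output coordinate.
    // The result may be negative inside the border region covered by the padding.
    inline int conv_input_anchor(int out_coord, int stride, int pad_before)
    {
        return out_coord * stride - pad_before;
    }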
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index 50f17d5..1fce14f 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
@@ -44,14 +45,15 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                           const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && (input->data_type() == DataType::F32 || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                                                                                                     && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                                                                                                     && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
-                                                                                                     && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
-                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.enabled()) && ((input->data_type() != DataType::QASYMM8) || ((act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU)
+                                                                                                           && (act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC))),
+                                    "For QASYMM8 only logistic, relu, lower bounded relu and lower-upper bounded relu are supported"); //COMPMID-1317 add fused activation for F32
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier > 1); // COMPMID-1071 Add depth multiplier support for NHWC
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
 
     const bool is_qasymm = is_data_type_quantized_asymmetric(input->data_type());
@@ -96,7 +98,7 @@
     const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
 
     const unsigned int num_rows_processed_per_iteration = is_stride_1 ? 2 : 1;
-    const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : 2;
+    const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->element_size());
     const unsigned int num_rows_read_per_iteration      = num_rows_processed_per_iteration + 2;
     const unsigned int num_rows_written_per_iteration   = std::ceil(num_rows_processed_per_iteration / static_cast<float>(conv_info.stride().first));
 
@@ -137,8 +139,7 @@
 }
 
 void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                                         unsigned int        depth_multiplier,
-                                                         ActivationLayerInfo act_info)
+                                                         unsigned int depth_multiplier, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -158,8 +159,9 @@
     ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
     ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
 
-    const bool is_qasymm   = is_data_type_quantized_asymmetric(input->info()->data_type());
-    const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_qasymm         = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const bool is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
     _input                              = input;
     _output                             = output;
@@ -168,9 +170,16 @@
     _conv_stride_y                      = conv_info.stride().second;
     _num_rows_processed_per_iteration   = is_stride_1 ? 2 : 1;
     _num_planes_processed_per_iteration = is_stride_1 ? 2 : 1;
-    _border_size                        = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
 
-    const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : 2;
+    // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
+    if(is_dot8_supported && is_qasymm)
+    {
+        _num_planes_processed_per_iteration = 1;
+    }
+
+    _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
+
+    const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size());
 
     CLBuildOptions build_opts;
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
@@ -211,16 +220,20 @@
                 const float s2 = output->info()->quantization_info().scale;
                 const int   o2 = output->info()->quantization_info().offset;
 
+                build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
+                build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
                 if(o1 != o2 || s1 != s2)
                 {
-                    build_opts.add_option("-DS1_VAL=" + float_to_string_with_full_precision(s1));
                     build_opts.add_option("-DS2_VAL=" + float_to_string_with_full_precision(s2));
-                    build_opts.add_option("-DO1_VAL=" + support::cpp11::to_string(o1));
                     build_opts.add_option("-DO2_VAL=" + support::cpp11::to_string(o2));
                 }
             }
         }
     }
+    else
+    {
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
+    }
 
     if(is_stride_1)
     {
@@ -233,11 +246,12 @@
         build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x));
         build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
     }
+    build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1,
+                             "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)))));
 
     // Create kernel
-    const bool  is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-    std::string kernel_name       = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
-                                                                                                                          && is_stride_1 ) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+    std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
+                                                                                                                    && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -280,8 +294,12 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    Window win = window;
-    win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)), 1));
+    // Collapse window
+    Window       window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    const size_t total_batches    = _input->info()->tensor_shape().total_size_upper(3);
+
+    Window win = window_collapsed;
+    win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1));
 
     // Create input window and adjust
     Window win_in = win;
@@ -290,10 +308,10 @@
 
     ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step()));
 
-    Window slice_in  = win_in.first_slice_window_3D();
-    Window slice_out = win.first_slice_window_3D();
+    Window slice_in  = win_in.first_slice_window_4D();
+    Window slice_out = win.first_slice_window_4D();
 
-    unsigned int idx = 3 * num_arguments_per_3D_tensor();
+    unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor();
 
     if(_biases != nullptr)
     {
@@ -310,11 +328,11 @@
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice_in);
-        add_3D_tensor_argument(idx, _output, slice_out);
+        add_4D_tensor_argument(idx, _input, slice_in);
+        add_4D_tensor_argument(idx, _output, slice_out);
         add_3D_tensor_argument(idx, _weights, slice_out);
 
         enqueue(queue, *this, slice_out, lws_hint());
     }
-    while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+    while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in));
 }
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index f51628f..af7df14 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -96,10 +96,12 @@
     AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
     if(_run_derivative_x && _run_derivative_y)
     {
+        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
         input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
     }
     else if(_run_derivative_x)
     {
+        // TODO(COMPMID-415) Fix x-access input bug in CL kernel instead of '+2'
         input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
     }
     else if(_run_derivative_y)
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index c8da7ac..471b320 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -93,8 +93,14 @@
 inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
                                                  DataType data_type, DataLayout data_layout)
 {
-    return gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && (kernel_size <= 5)
-           && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32) && (data_layout == DataLayout::NCHW);
+    return gpu_target_is_in(gpu_target,
+                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                            GPUTarget::G52, GPUTarget::G52LIT)
+           && (kernel_size <= 5)
+           && (conv_stride_x == 1) && (conv_stride_y == 1)
+           && (data_type == DataType::F32)
+           && (data_layout == DataLayout::NCHW);
 }
 
 inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
@@ -278,6 +284,7 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
     // Output auto initialization if not yet initialized
+    // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output, output_shape,
                        1,
                        input->data_type(),
@@ -356,6 +363,7 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 
     // Output auto initialization if not yet initialized
+    // FIXME: input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output->info(),
                        output_shape,
                        1,
@@ -413,8 +421,7 @@
     }
     else
     {
-        bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
-
+        const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
         build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
         build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index baf6bb6..6920667 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -168,7 +168,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
 
     do
     {
@@ -176,5 +177,5 @@
         add_3D_tensor_argument(idx, _tensor, slice);
         enqueue(queue, *this, slice, cl::NullRange);
     }
-    while(window.slide_window_slice_3D(slice));
+    while(collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index 1718914..5c38568 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -90,19 +90,21 @@
     _input  = input;
     _output = output;
 
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
-
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option_if(output->info()->num_dimensions() > 2, "-DDST_DIM1=" + support::cpp11::to_string(output->info()->dimension(1)));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("flatten", build_opts.options()));
+
     // Set config_id for enabling LWS tuning
     _config_id = "flatten";
     _config_id += "_";
@@ -131,21 +133,15 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+    Window collapsed_window = window.collapse(ICLKernel::window(), Window::DimZ);
 
-    Window out_slice = out_window.first_slice_window_1D();
-    Window in_slice  = window.first_slice_window_3D();
+    Window output_window;
+    output_window.use_tensor_dimensions(_output->info()->tensor_shape());
 
     // Run kernel
-    do
-    {
-        // Set arguments
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_1D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, collapsed_window);
+    add_3D_tensor_argument(idx, _output, output_window);
+    enqueue(queue, *this, collapsed_window, lws_hint());
 }
 } // namespace arm_compute
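For reference, the reworked flatten kernel collapses the first three input dimensions into the first output dimension in a single enqueue instead of looping over 3D slices. A minimal sketch of the index mapping, assuming a densely packed W x H x D x N source (the helper name below is illustrative, not part of the library):

    #include <cstddef>

    // Flattened offset of element (x, y, z): the first three dimensions of a
    // W x H x D x N tensor collapse into one dimension of size W*H*D. On a
    // densely packed tensor flattening is a straight copy; the CL kernel's job
    // is to do this mapping on device for padded/strided tensors, presumably
    // via the SRC_WIDTH/SRC_HEIGHT/SRC_DEPTH build options added above.
    constexpr std::size_t flatten_index(std::size_t x, std::size_t y, std::size_t z,
                                        std::size_t w, std::size_t h)
    {
        return x + y * w + z * w * h;
    }

    static_assert(flatten_index(1, 2, 3, 4, 5) == 69, "1 + 2*4 + 3*4*5");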
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 20e3a3a..831173d 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -33,7 +34,42 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    auto_init_if_empty(*output, *input);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 CLFloorKernel::CLFloorKernel()
     : _input(nullptr), _output(nullptr)
@@ -47,14 +83,13 @@
     // Auto initialize output
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
     // Create kernel
     std::set<std::string> build_opts;
@@ -63,13 +98,17 @@
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("floor_layer", build_opts));
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+}
 
-    ICLKernel::configure_internal(win);
+Status CLFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
 }
 
 void CLFloorKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -89,3 +128,4 @@
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
\ No newline at end of file
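The floor kernel now derives its step from the element size, so each work item consumes one 16-byte vector regardless of data type. A small sketch of that arithmetic (hypothetical helper, not a library function):

    #include <cstddef>

    // 16 bytes per iteration: 4 elements for F32, 8 for F16.
    constexpr std::size_t elems_per_iteration(std::size_t element_size_in_bytes)
    {
        return 16 / element_size_in_bytes;
    }

    static_assert(elems_per_iteration(4) == 4, "F32 -> 4 elements");
    static_assert(elems_per_iteration(2) == 8, "F16 -> 8 elements");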
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
new file mode 100644
index 0000000..e14b8a3
--- /dev/null
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                          float epsilon)
+{
+    ARM_COMPUTE_UNUSED(epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(conv_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(conv_weights, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_var);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_mean, bn_var);
+
+    unsigned int kernels_idx = get_data_layout_dimension_index(conv_weights->data_layout(), DataLayoutDimension::BATCHES);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_weights->dimension(kernels_idx) != bn_mean->dimension(0));
+
+    // Validate bias
+    if(conv_bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, conv_bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, conv_bias);
+    }
+    // Validate beta
+    if(bn_beta != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_beta);
+    }
+    // Validate gamma
+    if(bn_gamma != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, bn_gamma);
+    }
+
+    // Validate output weights
+    if(fused_weights != nullptr && fused_weights->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(conv_weights, fused_weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(conv_weights, fused_weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_weights);
+    }
+    // Validate output bias
+    if(fused_bias != nullptr && fused_bias->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(conv_weights, fused_bias);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
+    : _conv_weights(nullptr), _conv_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
+      _run_in_place_weights(false), _run_in_place_bias(false)
+{
+}
+
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+                                               ICLTensor *fused_weights, ICLTensor *fused_bias,
+                                               const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+                                               float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(conv_weights, bn_mean, bn_var);
+
+    _conv_weights  = conv_weights;
+    _conv_bias     = conv_bias;
+    _bn_mean       = bn_mean;
+    _bn_var        = bn_var;
+    _bn_beta       = bn_beta;
+    _bn_gamma      = bn_gamma;
+    _fused_weights = fused_weights;
+    _fused_bias    = fused_bias;
+    _epsilon       = epsilon;
+
+    _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights);
+    _run_in_place_bias    = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias);
+
+    // Auto initialize outputs
+    if(_fused_weights != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*_fused_weights->info(), *_conv_weights->info()->clone());
+        fused_weights->info()->set_valid_region(conv_weights->info()->valid_region());
+    }
+    if(_fused_bias != nullptr)
+    {
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
+        _fused_bias->info()->set_valid_region(bn_mean->info()->valid_region());
+    }
+
+    // Validate arguments
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(conv_weights->info(), bn_mean->info(), bn_var->info(),
+                                                  (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+                                                  (fused_bias != nullptr) ? fused_bias->info() : nullptr,
+                                                  (conv_bias != nullptr) ? conv_bias->info() : nullptr,
+                                                  (bn_beta != nullptr) ? bn_beta->info() : nullptr,
+                                                  (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
+                                                  epsilon));
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration_x = 16 / conv_weights->info()->element_size();
+    const int          output_width_x                      = conv_weights->info()->tensor_shape().x();
+    const bool         multi_access_x                      = (output_width_x / num_elems_processed_per_iteration_x > 0);
+
+    Window win = calculate_max_window(*conv_weights->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX, Window::Dimension(win.x().start(),
+                                                ceil_to_multiple(win.x().end(), num_elems_processed_per_iteration_x),
+                                                num_elems_processed_per_iteration_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(conv_weights->info()->data_type()));
+    build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(conv_weights->info()->data_type()));
+    build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(conv_weights->info()->dimension(2)));
+    build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - num_elems_processed_per_iteration_x, 0)));
+    build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
+    build_opts.add_option_if(_run_in_place_bias, "-DIN_PLACE_B");
+    build_opts.add_option_if(conv_bias != nullptr, "-DHAS_BIAS");
+    build_opts.add_option_if(bn_beta == nullptr, "-DUSE_DEFAULT_BETA");
+    build_opts.add_option_if(bn_gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fuse_batchnormalization_layer", build_opts.options()));
+}
+
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                                                const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                                                const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon));
+    return Status{};
+}
+
+void CLFuseBatchNormalizationKernel::run(const arm_compute::Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Create window slice
+    Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+    Window slice            = collapsed_window.first_slice_window_4D();
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    // Add kernel arguments
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _conv_weights, slice);
+    add_1D_tensor_argument(idx, _bn_mean, vector_slice);
+    add_1D_tensor_argument(idx, _bn_var, vector_slice);
+    if(!_run_in_place_weights)
+    {
+        add_4D_tensor_argument(idx, _fused_weights, slice);
+    }
+    if(!_run_in_place_bias)
+    {
+        add_1D_tensor_argument(idx, _fused_bias, vector_slice);
+    }
+    if(_conv_bias != nullptr)
+    {
+        add_1D_tensor_argument(idx, _conv_bias, vector_slice);
+    }
+    if(_bn_beta != nullptr)
+    {
+        add_1D_tensor_argument(idx, _bn_beta, vector_slice);
+    }
+    if(_bn_gamma != nullptr)
+    {
+        add_1D_tensor_argument(idx, _bn_gamma, vector_slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace arm_compute
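The new kernel folds a batch-normalisation stage into the preceding convolution's weights and bias. A host-side sketch of the standard per-channel fusion formulation (the defaults gamma = 1 and beta = 0 are what the USE_DEFAULT_GAMMA / USE_DEFAULT_BETA options stand in for; the helper below illustrates the math rather than transcribing the CL kernel):

    #include <cmath>

    // Per-channel fusion: every conv weight of channel c is scaled by
    // gamma / sqrt(var + eps), and a fused bias replaces the original one.
    struct FusedChannel
    {
        float weight_scale; // multiply each conv weight of the channel by this
        float bias;         // fused bias for the channel
    };

    FusedChannel fuse_channel(float conv_bias, float mean, float var,
                              float gamma, float beta, float epsilon)
    {
        const float inv_std = 1.0f / std::sqrt(var + epsilon);
        return FusedChannel{ gamma * inv_std,
                             (conv_bias - mean) * gamma * inv_std + beta };
    }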
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index ae54e77..f333c1b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -115,7 +115,7 @@
 {
 }
 
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d, bool unroll_block)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -132,6 +132,7 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+    build_opts.add_option_if(unroll_block, "-DUNROLL_BLOCK");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 9adf95f..b2fb3e0 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -57,19 +57,17 @@
 
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
 
     if(!is_interleaved_transposed)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
-        if(output->total_size() != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
-            ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        }
     }
     else
     {
@@ -95,71 +93,127 @@
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+    }
 
-        if(output->total_size() != 0)
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
-            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-        }
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     }
 
     return Status{};
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
-                                                        ElementsProcessed &num_elements_processed)
+                                                        const GEMMReshapeInfo &reshape_info, ElementsProcessed &num_elements_processed)
 {
+    const bool    is_dot8_supported                   = dot8_supported(CLKernelLibrary::get().get_device());
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
+    bool          reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
 
     Window win{};
+    Window win_out{};
     bool   window_changed = false;
 
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_input_as_3d  = false;
+        reinterpret_output_as_3d = false;
+    }
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*output);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the output tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(output->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
     if(is_interleaved_transposed)
     {
+        // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
+        ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
+
         // Configure kernel window
         num_elems_processed_per_iteration_x = 4;
         num_elems_processed_per_iteration_y = 4;
 
-        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+        // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+        const int m          = reshape_info.m();
+        const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
+
+        win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
         AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
-        AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        AccessWindowStatic    input1_access(input1, 0, 0,
+                                            ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                            ceil_to_multiple(input1->dimension(1), num_elems_processed_per_iteration_y));
+        AccessWindowStatic output_access(output, 0, 0,
+                                         ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                         output->dimension(1) + bottom_pad);
 
-        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
 
-        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+        output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
     }
     else
     {
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
-        num_elems_processed_per_iteration_x = 4;
-        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 5);
+        // Note: if the dot product instruction is available, the 8x2 tile has to be used
+        num_elems_processed_per_iteration_x = is_dot8_supported ? 8 : 4;
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), is_dot8_supported ? 2 : 4);
+
+        // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
+        // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
+        const int m          = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
+        const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
 
         // Configure window
-        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-        AccessWindowStatic    input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
-        AccessWindowStatic    input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
+        AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+        AccessWindowStatic output_access(output, 0, 0,
+                                         ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
+                                         output->dimension(1) + bottom_pad);
 
-        window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+        window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
+                         update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
 
         Coordinates coord;
         coord.set_num_dimensions(output->num_dimensions());
-        output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+        output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
     }
 
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
 CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
-    : _input0(nullptr), _input1(nullptr), _output(nullptr)
+    : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
 {
 }
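As the comments in the window configuration above note, the bottom padding is computed by hand so that the static access window covers every row touched by the last y-tile. A worked example of that expression (illustrative values only):

    // bottom_pad = (step_y - m % step_y) % step_y
    constexpr int bottom_pad(int m, int step_y)
    {
        return (step_y - m % step_y) % step_y;
    }

    // 7 rows processed 4 at a time: the last tile covers rows 4..7, so one
    // extra padded row is needed; a multiple of the step needs none.
    static_assert(bottom_pad(7, 4) == 1, "(4 - 7 % 4) % 4 == 1");
    static_assert(bottom_pad(8, 4) == 0, "(4 - 8 % 4) % 4 == 0");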
 
@@ -167,18 +221,25 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
-    // Output tensor auto inizialitation if not yet initialized
-    TensorShape tensor_shape{ input0->info()->tensor_shape() };
-    tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
-    tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
-
-    auto_init_if_empty(*output->info(), tensor_shape, 1, DataType::S32, QuantizationInfo());
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
 
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
+    _input0                   = input0;
+    _input1                   = input1;
+    _output                   = output;
+    _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
+    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
+
+    // In case both input and output have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
+    _slide_matrix_b                          = (_input1->info()->num_dimensions() >= num_dimensions_input0);
 
     ElementsProcessed num_elements_processed{};
 
@@ -186,15 +247,21 @@
     GPUTarget arch_target = get_arch_from_target(get_target());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
     // Create build options
-    CLBuildOptions build_opts;
     std::string    kernel_name(" ");
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+
     if(is_interleaved_transposed)
     {
         const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
@@ -205,6 +272,7 @@
         //        the correct step which is calculated as (16 * mult_transpose1xW_width) / 4)
 
         build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
         build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width));
         build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
 
@@ -225,6 +293,8 @@
     // Set config_id for enabling LWS tuning
     _config_id = "gemmlowp_";
     _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
     _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
@@ -242,6 +312,7 @@
                                                               input1->clone().get(),
                                                               output->clone().get(),
                                                               is_interleaved_transposed,
+                                                              reshape_info,
                                                               num_elements_processed)
                                 .first);
 
@@ -253,18 +324,40 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice          = window.first_slice_window_2D();
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
-    slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
-    slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
-    slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if(_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if(_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
 
     do
     {
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
-        if(_input1->info()->num_dimensions() < 3)
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if(!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -273,7 +366,10 @@
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint());
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }
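When either side of the multiplication is reinterpreted as a 3D tensor, run() now forwards the sum of that tensor's top and bottom padding as an extra argument, so the kernel can account for the padded rows separating consecutive planes when it translates a 2D row index back into (y, z) coordinates. The arithmetic is trivial; an illustration with made-up numbers:

    // A plane that is 32 rows high but padded by 1 row above and 3 below
    // occupies 36 rows in memory, so plane z starts roughly at row
    // z * (height + cross_plane_pad) rather than z * height.
    constexpr unsigned int cross_plane_pad(unsigned int pad_top, unsigned int pad_bottom)
    {
        return pad_top + pad_bottom;
    }

    static_assert(cross_plane_pad(1, 3) == 4, "example above");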
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index aa954ab..d348f2c 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -46,11 +46,18 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
                           int32_t a_offset, int32_t b_offset)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
 
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
     // If a_offset == 0, vector_sum_col can be a nullptr
     if(a_offset != 0)
     {
@@ -62,16 +69,24 @@
     if(b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
         if(output_shape.num_dimensions() > 1)
         {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
             TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
             vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(2);
+            output_shape.collapse_from(output_batch_idx);
 
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
             if(a_offset != 0)
@@ -88,7 +103,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
                                                         int32_t a_offset, int32_t b_offset)
 {
     constexpr unsigned int num_elems_processed_per_iteration = 4;
@@ -98,20 +113,23 @@
     Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
 
     AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win,
-                                                                 mm_result_access);
+    window_changed = window_changed || update_window_and_padding(win, mm_result_access);
 
     if(a_offset != 0)
     {
         AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win,
-                                                                     vector_sum_col_access);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
     }
     if(b_offset != 0)
     {
         AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
-        window_changed = window_changed || update_window_and_padding(win,
-                                                                     vector_sum_row_access);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -120,22 +138,30 @@
 } // namespace
 
 CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
-    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
 {
 }
 
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
+                                                   int32_t b_offset)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
                                                   vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
                                                   vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                  bias != nullptr ? bias->info() : nullptr,
                                                   a_offset, b_offset)); // NOLINT
 
     _vector_sum_col = vector_sum_col;
     _vector_sum_row = vector_sum_row;
     _mm_result      = mm_result;
+    _bias           = bias;
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->info()->num_dimensions() > 1
+                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
@@ -149,20 +175,26 @@
     // If b_offset == 0, vector_sum_row can be a nullptr
     build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
     build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    std::string kernel_name("gemmlowp_offset_contribution");
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(mm_result->info(),
                                                     vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
                                                     vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                    bias != nullptr ? bias->info() : nullptr,
                                                     a_offset, b_offset); // NOLINT
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     // Set config_id for enabling LWS tuning
-    _config_id = "gemmlowp_offset_contribution_";
+    _config_id = kernel_name + "_";
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
@@ -170,13 +202,14 @@
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
 }
 
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
                                                     int32_t a_offset, int32_t b_offset)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
                                                               vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
                                                               vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
                                                               a_offset, b_offset)
                                 .first); // NOLINT
 
@@ -194,11 +227,17 @@
     // Set window for vector_sum_col
     Window win_vector_sum_col = slice;
     win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Set window for vector_sum_row
     Window win_vector_sum_row = slice;
     win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
 
     do
     {
@@ -212,7 +251,11 @@
         {
             add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
         }
-        enqueue(queue, *this, slice);
+        if(_bias != nullptr)
+        {
+            add_1D_tensor_argument(idx, _bias, biases_slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
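The K_OFFSET, A_OFFSET and B_OFFSET options encode the usual low-precision GEMM correction: expanding sum_k (A[i][k] - z_a) * (B[k][j] - z_b) leaves the raw unsigned product plus a per-column, a per-row and a constant term. A scalar reference of the correction added to each int32 element, assuming the convention that a_offset = -z_a and b_offset = -z_b and that vector_sum_row / vector_sum_col hold the row sums of A and the column sums of B (the helper is illustrative only):

    #include <cstdint>

    // Correction added to one element of the raw A*B accumulator.
    // sum_col : sum of the matching column of B (vector_sum_col)
    // sum_row : sum of the matching row of A (vector_sum_row)
    // k       : depth of the matrix multiplication
    // bias    : 0 when no bias tensor is fused in (the new ADD_BIAS path)
    int32_t offset_contribution(int32_t sum_col, int32_t sum_row, int32_t k,
                                int32_t a_offset, int32_t b_offset, int32_t bias)
    {
        return a_offset * sum_col + b_offset * sum_row + a_offset * b_offset * k + bias;
    }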
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000..83af0c6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+    ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output,
+                                                        int32_t a_offset, int32_t b_offset)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    bool                   window_changed                    = false;
+
+    // Auto initialize the output
+    auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, mm_result_access);
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, output_access);
+
+    if(a_offset != 0)
+    {
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+    }
+    if(b_offset != 0)
+    {
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
+    : _mm_result(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
+                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                  bias != nullptr ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  a_offset, b_offset, output_stage)); // NOLINT
+
+    const int min = output_stage.gemmlowp_min_bound;
+    const int max = output_stage.gemmlowp_max_bound;
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+    _bias           = bias;
+    _output         = output;
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->info()->num_dimensions() > 1
+                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
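+    // a_offset * b_offset * k is the constant part of the offset contribution (it is the same for every
+    // output element), so it is folded into a single compile-time define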
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shift));
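+    // The clamp bounds are only defined when they restrict the default uchar range [0, 255]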
+    build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+
+    // Fuse output stage
+    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("GEMMLowpOutputStage can not be NONE!");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(),
+                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                    bias != nullptr ? bias->info() : nullptr,
+                                                    output->info(),
+                                                    a_offset, b_offset); // NOLINT
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
+}
+
+Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                               const ITensorInfo *output,
+                                                               int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
+                                                              output->clone().get(),
+                                                              a_offset, b_offset)
+                                .first); // NOLINT
+
+    return Status{};
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
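+    // Set window for bias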
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _mm_result, slice);
+        if(_vector_sum_col != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col);
+        }
+        if(_vector_sum_row != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
+        }
+        if(_bias != nullptr)
+        {
+            add_1D_tensor_argument(idx, _bias, biases_slice);
+        }
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 875e26d..b7eff0f 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -27,9 +27,12 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -38,7 +41,8 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                          int min, int max)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
@@ -63,10 +67,13 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
 
     // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
 
@@ -75,8 +82,9 @@
 
     if(output->total_size() != 0)
     {
+        Window                 win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
         AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, output_result_access);
+        window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
 
         output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
     }
@@ -100,7 +108,8 @@
 {
 }
 
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                                                                           int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
@@ -112,20 +121,14 @@
     return Status{};
 }
 
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
-                                                                          int result_offset_after_shift, int min, int max)
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                          int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+                                                                          int min, int max)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  min,
-                                                  max));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
+                                                  min, max));
 
     _input  = input;
     _bias   = bias;
@@ -154,9 +157,11 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    // Create input window
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
+    // Setup bias slice
     unsigned int idx1 = num_arguments_per_3D_tensor();
     if(_bias != nullptr)
     {
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
new file mode 100644
index 0000000..b7730d5
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                          int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check bias if it exists
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access);
+
+    if(output->total_size() != 0)
+    {
+        Window                 win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win_out, output_result_access);
+
+        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+class Coordinates;
+} // namespace arm_compute
+
+CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel()
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Status{};
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                     float multiplier, int offset,
+                                                                     int min, int max)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
+
+    _input  = input;
+    _bias   = bias;
+    _output = output;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
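+    // This variant re-quantizes with a single floating-point multiplier rather than a fixed-point multiplier and shift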
+    build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(multiplier));
+    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(offset));
+    build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_float", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Create input window
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Setup bias slice
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if(_bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, _bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx1, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 5789113..621bd2b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -63,7 +63,7 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
 
     // Configure kernel window
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index cd26cd1..225c358 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -59,7 +60,7 @@
 
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowStatic     input_access(input, 0, 0, input->dimension(0), input->dimension(1));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
     bool window_changed = update_window_and_padding(win, input_access, output_access);
@@ -115,8 +116,12 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
 
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
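+    // Use the dot8 variant of the reduction kernel when the device supports the Arm 8-bit integer dot product feature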
+    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info());
@@ -196,8 +201,8 @@
     Window slice_out = collapsed.first_slice_window_2D();
     Window slice_in  = slice_out;
 
-    slice_in.set(Window::DimY, Window::Dimension(0, 1, 1));
-    slice_in.set(Window::DimZ, Window::Dimension(0, 1, 1));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     do
     {
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 2f1f1bf..93332de 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -52,7 +52,11 @@
                                                         unsigned int &num_elems_processed_per_iteration)
 {
     // Select the vector size to use (8 for Bifrost; 16 for Midgard).
-    num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) ? 8 : 16;
+    bool is_gpu_bifrost = gpu_target_is_in(gpu_target,
+                                           GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                                           GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                                           GPUTarget::G52, GPUTarget::G52LIT);
+    num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
 
     // Configure kernel window
     Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index 0c65bb4..825d7fb 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -60,7 +60,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_UNUSED(input, output, beta);
+    ARM_COMPUTE_UNUSED(beta);
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 8530ed2..c9ed776 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
@@ -48,12 +47,14 @@
 {
 using ElementsProcessed = Steps;
 
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+                                 bool fp_mixed_precision)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
@@ -111,7 +112,7 @@
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
-    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 1);
+    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -217,18 +218,19 @@
 {
 }
 
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+                                           bool fp_mixed_precision)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
 
     // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, fp_mixed_precision));
 
     _input0                   = input0;
     _input1                   = input1;
     _output                   = output;
     _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1);
+    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -290,6 +292,11 @@
         else
         {
             kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
+            if(fp_mixed_precision && data_type == DataType::F16)
+            {
+                // Currently, the wider accumulator is only supported for FP16 kernels.
+                kernel_name += "_acc32";
+            }
         }
     }
     else // The input tensors have not been reshaped
@@ -305,6 +312,11 @@
             if(input0->info()->num_dimensions() != 1)
             {
                 kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+                if(fp_mixed_precision && data_type == DataType::F16)
+                {
+                    // Currently, the wider accumulator is only supported for FP16 kernels.
+                    kernel_name += "_acc32";
+                }
             }
             else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
             {
@@ -332,6 +344,7 @@
     // Set config_id for enabling LWS tuning
     _config_id = "gemm_";
     _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+    _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
     _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
     _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
     _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
@@ -348,12 +361,12 @@
 }
 
 Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
-                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
+                                            const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision)
 {
     // Note: num_elements_processed will be set in validate_and_configure_window()
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_UNUSED(alpha);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info, fp_mixed_precision));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                               input1->clone().get(),
                                                               output->clone().get(),
@@ -385,7 +398,7 @@
 
     if(_reinterpret_input_as_3d)
     {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
         const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 5b29905..aa1b92a 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
new file mode 100644
index 0000000..5d100a4
--- /dev/null
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(anchors, all_anchors);
+    ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
+    if(all_anchors->total_size() > 0)
+    {
+        size_t feature_height = info.feat_height();
+        size_t feature_width  = info.feat_width();
+        size_t num_anchors    = anchors->dimension(1);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(all_anchors, anchors);
+        ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
+        ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
+    }
+    return Status{};
+}
+} // namespace
+
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
+    : _anchors(nullptr), _all_anchors(nullptr)
+{
+}
+
+void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
+
+    // Metadata
+    const size_t   num_anchors = anchors->info()->dimension(1);
+    const DataType data_type   = anchors->info()->data_type();
+    const float    width       = info.feat_width();
+    const float    height      = info.feat_height();
+
+    // Initialize the output if empty
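+    // The output holds one anchor box of values_per_roi values for every (x, y, anchor) position of the feature map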
+    const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
+    auto_init_if_empty(*all_anchors->info(), output_shape, 1, data_type);
+
+    // Set instance variables
+    _anchors     = anchors;
+    _all_anchors = all_anchors;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DWIDTH=" + float_to_string_with_full_precision(width));
+    build_opts.add_option("-DHEIGHT=" + float_to_string_with_full_precision(height));
+    build_opts.add_option("-DSTRIDE=" + float_to_string_with_full_precision(1.f / info.spatial_scale()));
+    build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
+    build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("generate_proposals_compute_all_anchors", build_opts.options()));
+
+    // The tensor all_anchors can be interpreted as an array of structs (each struct has values_per_roi fields).
+    // This means we don't need to pad on the X dimension, as we know in advance how many fields
+    // compose the struct.
+    Window win = calculate_max_window(*all_anchors->info(), Steps(info.values_per_roi()));
+    ICLKernel::configure_internal(win);
+}
+
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
+    return Status{};
+}
+
+void CLComputeAllAnchorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Collapse everything on the first dimension
+    Window collapsed = window.collapse(ICLKernel::window(), Window::DimX);
+
+    // Set arguments
+    unsigned int idx = 0;
+    add_1D_tensor_argument(idx, _anchors, collapsed);
+    add_1D_tensor_argument(idx, _all_anchors, collapsed);
+
+    // Note that we don't need to loop over the slices, as we are launching exactly
+    // as many threads as there are anchors to generate
+    enqueue(queue, *this, collapsed);
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index ee39c71..b56ad8d 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -115,6 +115,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
+    // TODO (COMPMID-679): Add CLMemFill
     _output->map(queue, true);
     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
     memset(_output->buffer(), 0, _output->size());
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 0ba0d0e..54ef23f 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -109,7 +109,7 @@
         const int yin_end   = input->dimension(1);
 
         const int xout_start = 0;
-        const int xout_end   = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0);
+        const int xout_end   = input->dimension(0) < num_elems_processed_per_iteration ? output->dimension(0) + (num_elems_processed_per_iteration - input->dimension(0)) : output->dimension(0);
         const int yout_start = 0;
         const int yout_end   = output->dimension(1);
 
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 54ed51e..97dd919 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -49,9 +49,8 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
 
     // Reduce shape on axis
@@ -62,9 +61,9 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
     }
 
     return Status{};
@@ -110,11 +109,36 @@
     build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+    std::string  kernel_name;
+    unsigned int idx = 0;
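+    // Pick the kernel variant for the requested axis; epsilon is set right after the three tensor arguments (input, sum and output)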
+    switch(axis)
+    {
+        case 0:
+            kernel_name = "x";
+            idx         = num_arguments_per_1D_tensor() * 3;
+            break;
+        case 1:
+            kernel_name = "y";
+            idx         = num_arguments_per_2D_tensor() * 3;
+            break;
+        case 2:
+            kernel_name = "z";
+            idx         = num_arguments_per_3D_tensor() * 3;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize_" + kernel_name, build_opts));
 
     // Set epsilon argument
-    unsigned int idx = num_arguments_per_1D_tensor() * 3;
-    _kernel.setArg<cl_uint>(idx, _epsilon);
+    if(input->info()->data_type() == DataType::F32)
+    {
+        _kernel.setArg<cl_uint>(idx, _epsilon);
+    }
+    else
+    {
+        _kernel.setArg<cl_ushort>(idx, _epsilon);
+    }
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(_input->info(), _output->info());
@@ -137,18 +161,58 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
 
-    Window in_slice  = window.first_slice_window_1D();
-    Window sum_slice = window_sum.first_slice_window_1D();
-
-    do
+    switch(_axis)
     {
-        unsigned int idx = 0;
-        add_1D_tensor_argument(idx, _input, in_slice);
-        add_1D_tensor_argument(idx, _sum, sum_slice);
-        add_1D_tensor_argument(idx, _output, in_slice);
-        enqueue(queue, *this, in_slice);
+        case 0:
+        {
+            window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+            Window in_slice  = window.first_slice_window_1D();
+            Window sum_slice = window_sum.first_slice_window_1D();
+            do
+            {
+                unsigned int idx = 0;
+                add_1D_tensor_argument(idx, _input, in_slice);
+                add_1D_tensor_argument(idx, _sum, sum_slice);
+                add_1D_tensor_argument(idx, _output, in_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+        }
+        break;
+        case 1:
+        {
+            window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+            Window in_slice  = window.first_slice_window_2D();
+            Window sum_slice = window_sum.first_slice_window_2D();
+            do
+            {
+                unsigned int idx = 0;
+                add_2D_tensor_argument(idx, _input, in_slice);
+                add_2D_tensor_argument(idx, _sum, sum_slice);
+                add_2D_tensor_argument(idx, _output, in_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+        }
+        break;
+        case 2:
+        {
+            window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
+            Window in_slice  = window.first_slice_window_3D();
+            Window sum_slice = window_sum.first_slice_window_3D();
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _sum, sum_slice);
+                add_3D_tensor_argument(idx, _output, in_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
     }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
 }
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
new file mode 100644
index 0000000..ab53897
--- /dev/null
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+CLMemsetKernel::CLMemsetKernel()
+    : ICLKernel(), _tensor(nullptr)
+{
+}
+
+void CLMemsetKernel::configure(ICLTensor        *tensor,
+                               const PixelValue &constant_value)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    _tensor = tensor;
+
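+    // Each work item fills 16 bytes when the row is wide enough; otherwise the kernel falls back to element-by-element processing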
+    const DataType data_type      = tensor->info()->data_type();
+    const int      vec_size_x     = 16 / tensor->info()->element_size();
+    const int      output_width_x = tensor->info()->tensor_shape().x();
+    const bool     multi_access_x = (output_width_x / vec_size_x > 0);
+
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*tensor->info());
+    if(multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("memset", build_opts.options()));
+}
+
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value)
+{
+    ARM_COMPUTE_UNUSED(tensor);
+    ARM_COMPUTE_UNUSED(constant_value);
+    return Status{};
+}
+
+void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Collapse all the batches on the third dimension
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+    Window slice     = collapsed.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _tensor, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(collapsed.slide_window_slice_2D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index eb1ad68..67357da 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
@@ -61,24 +62,32 @@
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
 
-    const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
-    const unsigned int norm_size             = norm_info.norm_size();
-    bool               is_norm_accross_width = norm_idx == 0;
+    const unsigned int num_elems_processed_per_iteration = 4;
 
-    const unsigned int border_width = is_norm_accross_width ? std::min(norm_size / 2, 3U) : 0;
+    const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
+    const bool         is_norm_accross_width = norm_idx == 0;
+
+    const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
     const BorderSize   border_size  = BorderSize(0, border_width);
 
-    const unsigned int num_elems_processed_per_iteration = 4;
-    const unsigned int num_elems_read_per_iteration      = is_norm_accross_width ? (num_elems_processed_per_iteration + 2 * (norm_size / 2)) : num_elems_processed_per_iteration;
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
 
     // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
-    AccessWindowHorizontal input_access(input, -border_size.left, num_elems_read_per_iteration);
+    // Reads can occur within the valid region of the input
+    if(is_norm_accross_width)
+    {
+        AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+        window_changed = window_changed || update_window_and_padding(win, input_access);
+    }
+    else
+    {
+        AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, input_access);
+    }
+
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
+    window_changed = window_changed || update_window_and_padding(win, output_access);
     output_access.set_valid_region(win, input->valid_region());
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -109,14 +118,15 @@
     _input  = input;
     _output = output;
 
-    const unsigned int norm_idx     = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
-    _is_norm_across_width           = norm_idx == 0;
-    const unsigned int border_width = _is_norm_across_width ? std::min(norm_info.norm_size() / 2, 3U) : 0;
-    _border_size                    = BorderSize(0, border_width);
-
     const unsigned int num_elems_processed_per_iteration = 4;
     const bool         is_in_map_2D                      = (norm_info.type() == NormType::IN_MAP_2D);
 
+    const DataLayout   data_layout  = input->info()->data_layout();
+    const unsigned int norm_idx     = get_normalization_dimension_index(data_layout, norm_info);
+    _is_norm_across_width           = norm_idx == 0;
+    const unsigned int border_width = _is_norm_across_width ? num_elems_processed_per_iteration - 1 : 0;
+    _border_size                    = BorderSize(0, border_width);
+
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -127,6 +137,7 @@
     build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
     build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
     build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
+    build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
 
     // Create kernel
     std::string kernel_name = _is_norm_across_width ? "normalization_layer_in_map" : "normalization_layer_cross_map";
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
new file mode 100644
index 0000000..a44507b
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    if(input->data_layout() == DataLayout::NHWC)
+    {
+        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal std_access(std, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, mean_access, std_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLNormalizePlanarYUVLayerKernel::CLNormalizePlanarYUVLayerKernel()
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
+{
+}
+
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
+
+    _input  = input;
+    _output = output;
+    _mean   = mean;
+    _std    = std;
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int channel_idx                       = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    const DataType     dt                                = input->info()->data_type();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
+    build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
+
+    std::string kernel_name = "normalize_planar_yuv_layer_";
+    if(is_data_type_quantized(dt))
+    {
+        build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset)));
+        build_opts.add_option(("-DSCALE=" + support::cpp11::to_string(input->info()->quantization_info().scale)));
+        kernel_name += "q8_";
+    }
+
+    // Create kernel
+    kernel_name += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "normalize_planar_yuv_layer_";
+    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(dt));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+}
+
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get()).first);
+
+    return Status{};
+}
+
+void CLNormalizePlanarYUVLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    Window slice_in = collapsed.first_slice_window_1D();
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(idx, _mean, slice_in);
+    add_1D_tensor_argument(idx, _std, slice_in);
+
+    do
+    {
+        idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
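
The new kernel applies a per-channel normalization of the form out = (in - mean) / std, where mean and std hold one value per channel (see the NUM_CHANNELS build option above). A minimal reference sketch of that arithmetic, assuming a planar NCHW-style layout and using purely illustrative names, not library API:

#include <cstddef>
#include <vector>

// Reference-only sketch: normalize each channel plane with its own mean/std.
std::vector<float> normalize_planar_yuv_ref(const std::vector<float> &src,
                                            const std::vector<float> &mean,
                                            const std::vector<float> &std_dev,
                                            std::size_t channels,
                                            std::size_t plane_size)
{
    std::vector<float> dst(src.size());
    for(std::size_t c = 0; c < channels; ++c)
    {
        for(std::size_t i = 0; i < plane_size; ++i)
        {
            const std::size_t idx = c * plane_size + i; // planar (NCHW-style) layout assumed
            dst[idx] = (src[idx] - mean[c]) / std_dev[c];
        }
    }
    return dst;
}
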
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index c6f0f4b..a9a2c5c 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -93,17 +93,17 @@
     build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
 
     // Run [2, 0, 1] permute
-    if(_perm[0] == 2 && _perm[1] == 0 && _perm[2] == 1)
+    if(_perm == PermutationVector{ 2U, 0U, 1U })
     {
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
     }
     // Run [1, 2, 0] permute
-    else if(_perm[0] == 1 && _perm[1] == 2 && _perm[2] == 0)
+    else if(_perm == PermutationVector{ 1U, 2U, 0U })
     {
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
     }
     // Run [3, 2, 0, 1] permute
-    else if(_perm[0] == 3 && _perm[1] == 2 && _perm[2] == 0 && _perm[3] == 1)
+    else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
     {
         _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
     }
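
Comparing whole PermutationVector objects makes the supported mappings explicit. As a reminder of the semantics involved, a small illustrative sketch follows; it assumes the convention in which output dimension d takes its size from input dimension perm[d] (consistent with the NCHW-to-NHWC use of { 2, 0, 1 }), and none of the names below are library API:

#include <array>
#include <cstddef>

// Illustrative only: apply a 3-element permutation vector to a shape.
std::array<std::size_t, 3> apply_permutation(const std::array<std::size_t, 3> &in_shape,
                                              const std::array<std::size_t, 3> &perm)
{
    std::array<std::size_t, 3> out_shape{};
    for(std::size_t d = 0; d < 3; ++d)
    {
        out_shape[d] = in_shape[perm[d]];
    }
    return out_shape;
}
// Example: in_shape = { W, H, C }, perm = { 2, 0, 1 } -> out_shape = { C, W, H }.
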
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 4ca2ef8..286b94e 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -51,9 +51,9 @@
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
 
     const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -64,7 +64,7 @@
     if(output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                         "Output can only be U8 if both inputs are U8");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
@@ -168,27 +168,44 @@
         data_type = "DATA_TYPE_INT";
     }
 
+    const bool is_quantized = is_data_type_quantized_asymmetric(input1->info()->data_type());
+
     // Construct kernel name
     std::string kernel_name = "pixelwise_mul";
-    kernel_name += (scale_int >= 0) ? "_int" : "_float";
+    if(!is_data_type_quantized(output->info()->data_type()))
+    {
+        kernel_name += (scale_int >= 0) ? "_int" : "_float";
+    }
 
     // Set kernel build options
-    std::set<std::string> build_opts;
-    build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
-    build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
-    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
-    build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
-    build_opts.emplace("-D" + data_type);
+    CLBuildOptions build_opts;
+    if(is_quantized)
+    {
+        build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().offset));
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(output->info()->quantization_info().offset));
+        build_opts.add_option("-DSCALE_IN1=" + support::cpp11::to_string(input1->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale));
+        build_opts.add_option("-DSCALE_OUT=" + support::cpp11::to_string(output->info()->quantization_info().scale));
+        kernel_name += "_quantized";
+    }
+    else
+    {
+        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
+        build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
+        build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+        build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+        build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+        build_opts.add_option("-DDATA_TYPE_RES=" + compute_type);
+    }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Set scale argument
     unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters
 
-    if(scale_int >= 0)
+    if(scale_int >= 0 && !is_quantized)
     {
         _kernel.setArg(idx++, scale_int);
     }
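
For the new QASYMM8 path, the OFFSET_* and SCALE_* defines carry the quantization parameters into the kernel. A hedged sketch of the rescaling such a quantized multiply is expected to perform, using the usual convention real = scale * (quantized - offset); this is an illustration, not the kernel source:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative reference for one output pixel of a quantized multiply.
uint8_t pixelwise_mul_qasymm8_ref(uint8_t in1, uint8_t in2, float scale,
                                  float scale_in1, int offset_in1,
                                  float scale_in2, int offset_in2,
                                  float scale_out, int offset_out)
{
    // Dequantize both inputs and multiply in float, applying the user scale.
    const float a      = scale_in1 * (static_cast<int>(in1) - offset_in1);
    const float b      = scale_in2 * (static_cast<int>(in2) - offset_in2);
    const float result = a * b * scale;

    // Requantize with the output parameters and saturate to the uint8 range.
    const int q = static_cast<int>(std::lround(result / scale_out)) + offset_out;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}
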
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index df13068..bd21ea0 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -257,6 +257,8 @@
             build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
             build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
             build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
+                                     "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
             std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
             _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
             break;
@@ -315,12 +317,14 @@
     unsigned int pool_stride_y = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
 
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
     switch(_input->info()->data_layout())
     {
         case DataLayout::NCHW:
         {
-            Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-            Window slice            = window_collapsed.first_slice_window_3D();
+            Window slice = window_collapsed.first_slice_window_3D();
             do
             {
                 // Upsample input by pool size
@@ -343,21 +347,23 @@
         }
         case DataLayout::NHWC:
         {
-            Window slice = window.first_slice_window_3D();
+            const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
 
-            Window in_slice = window.first_slice_window_3D();
+            Window slice    = window_collapsed.first_slice_window_4D();
+            Window in_slice = window_collapsed.first_slice_window_4D();
             in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
             in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
             in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
+            in_slice.set(3, Window::Dimension(0, total_batches, 1));
             do
             {
                 // Set inputs
                 unsigned int idx = 0;
-                add_3D_tensor_argument(idx, _input, in_slice);
-                add_3D_tensor_argument(idx, _output, slice);
+                add_4D_tensor_argument(idx, _input, in_slice);
+                add_4D_tensor_argument(idx, _output, slice);
                 enqueue(queue, *this, slice, lws_hint());
             }
-            while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(in_slice));
+            while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
             break;
         }
         default:
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
new file mode 100644
index 0000000..63e745e
--- /dev/null
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+    // Check variances
+    const int var_size = info.variances().size();
+    if(var_size > 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
+        for(int i = 0; i < var_size; ++i)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.variances().at(i) <= 0.f, "Each variance value must be greater than 0");
+        }
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater than or equal to 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater than or equal to 0");
+
+    if(!info.max_sizes().empty())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+    }
+
+    for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+    }
+
+    if(output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+{
+    ARM_COMPUTE_UNUSED(input2);
+    // Output tensor auto initialization if not yet initialized
+    TensorShape output_shape = compute_prior_box_shape(*input1, info);
+    auto_init_if_empty(*output, output_shape, 1, input1->data_type());
+
+    Window win{};
+    bool   window_changed = false;
+
+    switch(input1->data_layout())
+    {
+        case DataLayout::NCHW:
+        {
+            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+
+            win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, output_access);
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            win = calculate_max_window(*output, Steps());
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    };
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLPriorBoxLayerKernel::CLPriorBoxLayerKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr), _info(), _num_priors(), _min(), _max(), _aspect_ratios()
+{
+}
+
+void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    _input1        = input1;
+    _input2        = input2;
+    _output        = output;
+    _info          = info;
+    _min           = min;
+    _max           = max;
+    _aspect_ratios = aspect_ratios;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), info));
+
+    // Calculate the number of priors
+    _num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+
+    const DataLayout data_layout = input1->info()->data_layout();
+
+    const int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    const int layer_width  = input1->info()->dimension(width_idx);
+    const int layer_height = input1->info()->dimension(height_idx);
+
+    int img_width  = info.img_size().x;
+    int img_height = info.img_size().y;
+    if(img_width == 0 || img_height == 0)
+    {
+        img_width  = input2->info()->dimension(width_idx);
+        img_height = input2->info()->dimension(height_idx);
+    }
+
+    float step_x = info.steps()[0];
+    float step_y = info.steps()[1];
+    if(step_x == 0.f || step_y == 0.f)
+    {
+        step_x = static_cast<float>(img_width) / layer_width;
+        step_y = static_cast<float>(img_height) / layer_height;
+    }
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(img_width));
+    build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(img_height));
+    build_opts.add_option("-DLAYER_WIDTH=" + support::cpp11::to_string(layer_width));
+    build_opts.add_option("-DLAYER_HEIGHT=" + support::cpp11::to_string(layer_height));
+    build_opts.add_option("-DSTEP_X=" + support::cpp11::to_string(step_x));
+    build_opts.add_option("-DSTEP_Y=" + support::cpp11::to_string(step_y));
+    build_opts.add_option("-DNUM_PRIORS=" + support::cpp11::to_string(_num_priors));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
+    build_opts.add_option_if(info.clip(), "-DIN_PLACE");
+
+    if(info.variances().size() > 1)
+    {
+        for(unsigned int i = 0; i < info.variances().size(); ++i)
+        {
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+        }
+    }
+    else
+    {
+        for(unsigned int i = 0; i < 4; ++i)
+        {
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+        }
+    }
+
+    unsigned int idx = 0;
+    // Create kernel
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            idx     = num_arguments_per_2D_tensor();
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nchw", build_opts.options()));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            idx     = num_arguments_per_3D_tensor();
+            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("prior_box_layer_nhwc", build_opts.options()));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    _kernel.setArg(idx++, *_min);
+    _kernel.setArg(idx++, *_max);
+    _kernel.setArg(idx++, *_aspect_ratios);
+    _kernel.setArg<unsigned int>(idx++, info.min_sizes().size());
+    _kernel.setArg<unsigned int>(idx++, info.max_sizes().size());
+    _kernel.setArg<unsigned int>(idx++, info.aspect_ratios().size());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info, _num_priors);
+
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
+    const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
+                                .first);
+
+    return Status{};
+}
+
+void CLPriorBoxLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
+    queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
+    if(!_info.max_sizes().empty())
+    {
+        queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
+    }
+
+    switch(_input1->info()->data_layout())
+    {
+        case DataLayout::NCHW:
+        {
+            Window slice = window.first_slice_window_2D();
+            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+
+            unsigned int idx = 0;
+            add_2D_tensor_argument(idx, _output, slice);
+            enqueue(queue, *this, slice);
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            Window slice = window.first_slice_window_3D();
+            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * _num_priors));
+            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
+
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, _output, slice);
+            enqueue(queue, *this, slice);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+} // namespace arm_compute
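
A small worked example of the two derived quantities configure() computes above, the number of priors and the default step sizes; the concrete values below are illustrative only:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<float> min_sizes     = { 30.f };
    const std::vector<float> max_sizes     = { 60.f };
    const std::vector<float> aspect_ratios = { 1.f, 2.f, 0.5f };

    // Same formula as in configure(): one prior per (aspect ratio, min size)
    // pair plus one extra prior per max size.
    const std::size_t num_priors = aspect_ratios.size() * min_sizes.size() + max_sizes.size();

    // When PriorBoxLayerInfo carries no explicit steps, they fall back to the
    // ratio between the image size and the feature-map size.
    const int   img_width   = 300;
    const int   layer_width = 38;
    const float step_x      = static_cast<float>(img_width) / layer_width;

    std::cout << "num_priors = " << num_priors << ", step_x = " << step_x << "\n"; // 4, ~7.89
    return 0;
}
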
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
new file mode 100644
index 0000000..325eeb2
--- /dev/null
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, rois);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
+    ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->dimension(2), rois->dimension(1));
+    auto_init_if_empty((*output), output_shape, 1, input->data_type());
+
+    // Configure kernel window
+    const unsigned int num_elems_processed_per_iteration = 1;
+    Window             win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input_access(input, input->valid_region().start(0), num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLROIAlignLayerKernel::CLROIAlignLayerKernel()
+    : _input(nullptr), _output(nullptr), _rois(nullptr), _pool_info(0, 0, 0.f)
+{
+}
+
+void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    _input     = input;
+    _output    = output;
+    _rois      = rois;
+    _pool_info = pool_info;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(Window::DimX)));
+    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(Window::DimY)));
+    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(Window::DimZ)));
+    build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
+    build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
+    build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
+    build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+
+    // Create kernel
+    std::string kernel_name = "roi_align_layer";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    ICLKernel::configure_internal(win_config.second);
+}
+
+Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
+    return Status{};
+}
+
+void CLROIAlignLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window slice      = window.first_slice_window_3D();
+    Window slice_rois = slice;
+    // Parallelize spatially and across the fourth dimension of the output tensor (also across the ROI tensor)
+    slice_rois.set_dimension_step(Window::DimX, _rois->info()->dimension(0));
+    slice.set(Window::DimZ, window[3]);
+
+    // Set arguments
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, _input, slice);
+    add_2D_tensor_argument(idx, _rois, slice_rois);
+    add_3D_tensor_argument(idx, _output, slice);
+    add_argument<cl_uint>(idx, _input->info()->strides_in_bytes()[3]);
+    add_argument<cl_uint>(idx, _output->info()->strides_in_bytes()[3]);
+
+    enqueue(queue, *this, slice);
+}
+} // namespace arm_compute
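
A hedged usage sketch of the new kernel, relying only on the configure() signature introduced above; the tensor setup is omitted, the pooled size and spatial scale are arbitrary, and dispatching through CLScheduler is an assumption about how a configured kernel is typically enqueued:

#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// feature_map: F16/F32 NCHW tensor; rois: 2D tensor of shape [5, num_rois];
// output: [pooled_w, pooled_h, channels, num_rois], auto-initialized if empty.
void roi_align_example(CLTensor &feature_map, CLTensor &rois, CLTensor &output)
{
    // 7x7 output per ROI; spatial scale maps ROI coordinates to the feature map.
    const ROIPoolingLayerInfo pool_info(7U, 7U, 0.0625f);

    CLROIAlignLayerKernel kernel;
    kernel.configure(&feature_map, &rois, &output, pool_info);

    // Assumes CLScheduler::get().default_init() has been called beforehand.
    CLScheduler::get().enqueue(kernel);
}
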
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index bf36ae2..ef46325 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -39,24 +39,22 @@
 
 namespace
 {
-// OpenCL kernel requires input width to be a power of 2.
+// OpenCL kernel requires the input width to be a power of 2 for the x-axis reduction.
 constexpr unsigned int border_val = 64;
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
-    ARM_COMPUTE_UNUSED(op);
-
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Reduction operation not supported for QASYMM8");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
+    ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
     }
 
     return Status{};
@@ -69,16 +67,44 @@
     output_shape.set(axis, 1);
     auto_init_if_empty(*output, output_shape, 1, input->data_type());
 
-    const unsigned int num_elems_processed_per_iteration = 16;
+    const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
+    Window             win                               = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool               window_changed                    = false;
 
-    Window             win          = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    const unsigned int border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
-
-    AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
-    AccessWindowHorizontal output_access(output, 0, 1);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, output->valid_region());
+    switch(axis)
+    {
+        case 0:
+        {
+            if(is_data_type_quantized(input->data_type()))
+            {
+                AccessWindowHorizontal input_access(input, 0, input->dimension(0));
+                AccessWindowHorizontal output_access(output, 0, 1);
+                window_changed = update_window_and_padding(win, input_access, output_access);
+                output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            }
+            else
+            {
+                const unsigned int     border_width = ((input->dimension(0) % border_val) != 0) ? border_val - input->dimension(0) % border_val : 0;
+                AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
+                AccessWindowHorizontal output_access(output, 0, 1);
+                window_changed = update_window_and_padding(win, input_access, output_access);
+                output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            }
+        }
+        break;
+        case 1:
+        case 2:
+        case 3:
+        {
+            AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, input_access, output_access);
+            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
 
@@ -96,46 +122,86 @@
     return _border_size;
 }
 
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
-    const unsigned int num_elems_processed_per_iteration = 16;
-    const unsigned int width_leftover                    = input->info()->dimension(0) % border_val;
-    const unsigned int border_width                      = (width_leftover != 0) ? border_val - width_leftover : 0;
-    const unsigned int num_of_threads                    = ((input->info()->dimension(0) + border_width) / 16);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op, width));
 
     _input          = input;
     _output         = output;
     _reduction_axis = axis;
     _op             = op;
 
-    // Set the number of WG based on the input size. If input width is < 128
-    // we can use fewer threads than 8.
-    cl::NDRange lws_hint = cl::NDRange(std::min(8U, num_of_threads));
-    _border_size         = BorderSize(0, border_width, 0, 0);
-
     // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
+    CLBuildOptions build_opts;
+    std::string    data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
+    if(is_data_type_quantized(input->info()->data_type()) && axis != 0)
+    {
+        data_type_promoted = "uint";
+    }
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+    build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+    build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
 
     switch(op)
     {
         case ReductionOperation::SUM_SQUARE:
-            build_opts.emplace(("-DOPERATION=square_sum"));
+            build_opts.add_option(("-DOPERATION=square_sum"));
             break;
         case ReductionOperation::SUM:
-            build_opts.emplace(("-DOPERATION=sum"));
+        case ReductionOperation::MEAN_SUM:
+            build_opts.add_option(("-DOPERATION=sum"));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation");
     }
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation", build_opts));
+    cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
+    std::string kernel_axis_name;
+    switch(axis)
+    {
+        case 0:
+        {
+            if(!is_data_type_quantized(input->info()->data_type()))
+            {
+                build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
+                const unsigned int width_leftover = input->info()->dimension(0) % border_val;
+                const unsigned int border_width   = (width_leftover != 0) ? border_val - width_leftover : 0;
+                const unsigned int num_of_threads = ((input->info()->dimension(0) + border_width) / 16);
+                kernel_axis_name                  = "x";
+
+                // Set the number of WG based on the input size. If input width is < 128
+                // we can use fewer threads than 8.
+                lws_hint     = cl::NDRange(std::min(8U, num_of_threads));
+                _border_size = BorderSize(0, border_width, 0, 0);
+            }
+            else
+            {
+                build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+                kernel_axis_name = "quantized_x";
+            }
+        }
+        break;
+        case 1:
+            build_opts.add_option("-DHEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+            kernel_axis_name = "y";
+            break;
+        case 2:
+            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+            kernel_axis_name = "z";
+            break;
+        case 3:
+            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+            build_opts.add_option("-DBATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
+            kernel_axis_name = "w";
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reduction_operation_" + kernel_axis_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
@@ -145,9 +211,9 @@
     ICLKernel::configure_internal(std::get<1>(win_config), lws_hint);
 }
 
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op, width));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), axis)));
 
     return Status{};
@@ -158,28 +224,113 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    // Set out window
-    Window out_window(window);
-    out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-
-    // Get first input and output slices
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = out_window.first_slice_window_2D();
-
-    // Reshape window
-    const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
-    in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
-
-    // Set local sums buffer
-    unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
-    _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
-
-    do
+    switch(_reduction_axis)
     {
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
+        case 0:
+        {
+            // We use parallel reduction only for non-quantized types
+            if(!is_data_type_quantized(_input->info()->data_type()))
+            {
+                // Set out window
+                Window out_window(window);
+                out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+                // Get first input and output slices
+                Window in_slice  = window.first_slice_window_2D();
+                Window out_slice = out_window.first_slice_window_2D();
+
+                // Reshape window
+                const unsigned int border_width = ((in_slice.x().end() % border_val) != 0) ? border_val - in_slice.x().end() % border_val : 0;
+                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
+
+                // Set local sums buffer
+                unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
+                _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_2D_tensor_argument(idx, _input, in_slice);
+                    add_2D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice, lws_hint());
+                }
+                while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            }
+            else
+            {
+                // Get first input and output slices
+                Window window_in{ window };
+                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+                Window in_slice  = window.first_slice_window_1D();
+                Window out_slice = window.first_slice_window_1D();
+
+                do
+                {
+                    unsigned int idx = 0;
+                    add_1D_tensor_argument(idx, _input, in_slice);
+                    add_1D_tensor_argument(idx, _output, out_slice);
+                    enqueue(queue, *this, in_slice);
+                }
+                while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+            }
+        }
+        break;
+        case 1:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+            Window in_slice  = window_in.first_slice_window_2D();
+            Window out_slice = window.first_slice_window_2D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_2D_tensor_argument(idx, _input, in_slice);
+                add_2D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+        }
+        break;
+        case 2:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+            Window in_slice  = window_in.first_slice_window_3D();
+            Window out_slice = window.first_slice_window_3D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, _input, in_slice);
+                add_3D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+        }
+        break;
+        case 3:
+        {
+            // Get first input and output slices
+            Window window_in{ window };
+            window_in.set(3, Window::Dimension(0, 1, 1));
+            Window in_slice  = window_in.first_slice_window_4D();
+            Window out_slice = window.first_slice_window_4D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_4D_tensor_argument(idx, _input, in_slice);
+                add_4D_tensor_argument(idx, _output, out_slice);
+                enqueue(queue, *this, in_slice);
+            }
+            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
     }
-    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
 }
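
The run() logic above now dispatches per axis, but every variant shares the shape contract established in validate_and_configure_window(): the reduced axis collapses to one element while all other dimensions are preserved. A minimal illustration of that rule (names are illustrative, not library API):

#include <cstddef>
#include <vector>

// Shape of the reduction output: same rule as output_shape.set(axis, 1).
std::vector<std::size_t> reduced_shape(std::vector<std::size_t> shape, unsigned int axis)
{
    shape.at(axis) = 1;
    return shape; // e.g. { W, H, C, N } reduced over axis 2 -> { W, H, 1, N }
}
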
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
new file mode 100644
index 0000000..7891844
--- /dev/null
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLReorgLayerKernel::CLReorgLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLReorgLayerKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t stride)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+    _input  = input;
+    _output = output;
+
+    std::string  kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+    const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
+    build_opts.add_option("-DSTRIDE=" + support::cpp11::to_string(stride));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure window
+    // Auto initialize the output tensor if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+
+    Window win = calculate_max_window(*output->info(), Steps());
+
+    // The CLReorgLayerKernel doesn't need padding, so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure_internal(win);
+
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += string_from_data_type(input->info()->data_type());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(stride);
+}
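+
+// A minimal usage sketch (illustrative only, not part of the kernel). Assuming a [W, H, C]
+// (NCHW) input whose width and height are multiples of the stride, the output shape produced
+// by misc::shape_calculator::compute_reorg_output_shape is expected to be
+// [W / stride, H / stride, C * stride * stride]:
+//
+//     CLTensor src, dst; // e.g. src initialized as F32 with shape [16, 16, 4]
+//     CLReorgLayerKernel reorg;
+//     reorg.configure(&src, &dst, 2); // dst is auto-initialized, here to [8, 8, 16]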
+
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+
+    return Status{};
+}
+
+void CLReorgLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index c7efa9a..aa1339d 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -37,8 +37,28 @@
 
 #include <string>
 
+/** [CLReshapeLayerKernel Kernel] **/
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+
+    return Status{};
+}
+
+} // namespace
+
 CLReshapeLayerKernel::CLReshapeLayerKernel()
     : _input(nullptr), _output(nullptr)
 {
@@ -46,20 +66,12 @@
 
 void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                  DataType::U16, DataType::S16,
-                                                  DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     // Create kernel
     std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
     _kernel                          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_layer", build_opts));
@@ -84,17 +96,20 @@
     _kernel.setArg<cl_int2>(idx++, output_shape);
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input->info());
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
+    // Set the output valid region
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
     ICLKernel::configure_internal(win);
 }
 
+Status CLReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+    return Status{};
+}
+
 void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -109,3 +124,4 @@
     add_3D_tensor_argument(idx, _output, window_collapsed);
     enqueue(queue, *this, slice);
 }
+/** [CLReshapeLayerKernel Kernel] **/
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index d56d6f7..ce6c016 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -62,7 +62,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
@@ -170,6 +170,8 @@
     float hr = 0.f;
     std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info());
 
+    const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
+
     DataLayout data_layout = input->info()->data_layout();
     const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -200,11 +202,18 @@
     build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
     build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
     build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+    if(call_quantized_kernel)
+    {
+        build_opts.add_option("-DSCALE=" + support::cpp11::to_string(input->info()->quantization_info().scale));
+        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().offset));
+    }
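+    // Note (illustrative only): for QASYMM8 bilinear scaling the OpenCL kernel receives the
+    // quantization parameters via -DSCALE / -DOFFSET so it can interpolate in the real domain.
+    // A minimal sketch of the assumed mapping (helper names are hypothetical, rounding omitted):
+    //
+    //     float dequantize(uchar q, float scale, int offset) { return scale * (float(q) - offset); }
+    //     uchar quantize(float r, float scale, int offset)   { return uchar(r / scale + offset);   }
+    //
+    // i.e. the neighbouring pixels are dequantized, interpolated, and re-quantized with the
+    // input quantization info.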
 
     std::string interpolation_name = string_from_interpolation_policy(policy);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
-    std::string kernel_name = "scale_" + interpolation_name + "_" + lower_string(string_from_data_layout(data_layout));
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+    std::string kernel_name = "scale_" + interpolation_name;
+    kernel_name += call_quantized_kernel ? "_quantized_" : "_";
+    kernel_name += lower_string(string_from_data_layout(data_layout));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
 
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
new file mode 100644
index 0000000..d488631
--- /dev/null
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                 const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const DataLayout data_layout = input->data_layout();
+        const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+        const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+        const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.x());
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] / block_shape_x != (output->tensor_shape()[idx_width] - padding_left.x() - padding_right.x()));
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] / block_shape_y != (output->tensor_shape()[idx_height] - padding_left.y() - padding_right.y()));
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLSpaceToBatchLayerKernel::CLSpaceToBatchLayerKernel()
+    : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr)
+{
+}
+
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+
+    _input       = input;
+    _block_shape = block_shape;
+    _paddings    = paddings;
+    _output      = output;
+
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+    ICLKernel::configure_internal(win);
+}
+
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                          ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+
+    _input  = input;
+    _output = output;
+
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
+    build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
+    build_opts.add_option("-DPAD_LEFT_X=" + support::cpp11::to_string(padding_left.x()));
+    build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
+    build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
+    build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+    ICLKernel::configure_internal(win);
+}
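+
+// A minimal usage sketch of the static variant (illustrative only, under the shape relations
+// checked by validate_arguments_static above: output width/height derive from the input
+// dimensions, the block shape and the paddings, and the output batch dimension is a multiple
+// of block_shape_x * block_shape_y):
+//
+//     CLTensor src, dst;
+//     CLSpaceToBatchLayerKernel s2b;
+//     s2b.configure(&src, /* block_shape_x */ 2, /* block_shape_y */ 2,
+//                   Size2D(0, 0) /* padding_left */, Size2D(0, 0) /* padding_right */, &dst);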
+
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
+    return Status{};
+}
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                           const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    return Status{};
+}
+
+void CLSpaceToBatchLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice_out = window.first_slice_window_3D();
+
+    Window slice_in = window.first_slice_window_4D();
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_in.set(3, Window::Dimension(0, 0, 0));
+
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+
+    Window padding_slice = window.first_slice_window_2D();
+    padding_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+    padding_slice.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    int batch_id = 0;
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, slice_in);
+        if(_paddings != nullptr && _block_shape != nullptr)
+        {
+            add_2D_tensor_argument(idx, _paddings, padding_slice);
+            add_1D_tensor_argument(idx, _block_shape, vector_slice);
+        }
+        add_argument(idx, batch_id);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+        ++batch_id;
+    }
+    while(window.slide_window_slice_3D(slice_out));
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
new file mode 100644
index 0000000..2d2ba10
--- /dev/null
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                          int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
+    {
+        return i == 0;
+    }));
+
+    // Get expected output shape
+    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+                                                                                                          starts, ends, strides,
+                                                                                                          begin_mask, end_mask, shrink_axis_mask);
+    ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
+
+    // Check the output if it has already been configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape() != exp_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+                                                        const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                                        int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    // Output tensor auto initialization if not yet initialized
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
+                                                                                                      starts, ends, strides,
+                                                                                                      begin_mask, end_mask, shrink_axis_mask);
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+
+    // Create window
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLStridedSliceKernel::CLStridedSliceKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+
+    _input  = input;
+    _output = output;
+
+    const TensorShape &input_shape = input->info()->tensor_shape();
+
+    const Coordinates final_strides = arm_compute::helpers::tensor_transform::strided_slice_strides(input_shape, strides);
+    const Coordinates starts_abs    = arm_compute::helpers::tensor_transform::strided_slice_absolute_start_coords(input_shape, starts, final_strides, begin_mask);
+    const Coordinates ends_abs      = arm_compute::helpers::tensor_transform::strided_slice_absolute_end_coords(input_shape, starts_abs, ends, final_strides, end_mask, shrink_axis_mask);
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    // Enable processing of multiple elements along x if stride_x is 1 and the output width is greater than the access vector size
+    const int  vec_size_x     = 16 / input->info()->element_size();
+    const int  output_width_x = output->info()->tensor_shape().x();
+    const bool multi_access_x = (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
+
+    // Update window if needed
+    if(multi_access_x)
+    {
+        Window &updated_window = std::get<1>(win_config);
+        updated_window.set(Window::DimX,
+                           Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win_config.second);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    {
+        build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
+        build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+    }
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
+                                  "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
+                                  "-DSRC_DEPTH=1");
+    build_opts.add_option_if_else(_output->info()->num_dimensions() > 2,
+                                  "-DDST_DEPTH=" + support::cpp11::to_string(_output->info()->tensor_shape().z()),
+                                  "-DDST_DEPTH=1");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("strided_slice", build_opts.options()));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "strided_slice";
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    {
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(input->info()->dimension(i));
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(starts_abs[i]);
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(ends_abs[i]);
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(final_strides[i]);
+    }
+}
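+
+// A minimal usage sketch (illustrative only): slicing a 2D tensor with unit strides.
+// Dimensions not covered by starts/ends/strides follow begin_mask/end_mask, and the output
+// shape is derived by compute_strided_slice_shape as validated above. End coordinates are
+// assumed exclusive here:
+//
+//     CLTensor src, dst; // e.g. src of shape [8, 8]
+//     CLStridedSliceKernel slice;
+//     slice.configure(&src, &dst,
+//                     Coordinates(2, 0), /* starts */
+//                     Coordinates(6, 8), /* ends */
+//                     BiStrides(1, 1),   /* strides */
+//                     0, 0, 0);          /* begin_mask, end_mask, shrink_axis_mask */
+//     // dst would be auto-initialized to shape [4, 8]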
+
+Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
+                                                              starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
+                                .first);
+
+    return Status{};
+}
+
+void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window_collapsed.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index 94e15f3..ccf22ea 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
@@ -86,8 +87,7 @@
 
     if(output->total_size() != 0)
     {
-        AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration), ceil_to_multiple(output->dimension(1),
-                                         num_elems_processed_per_iteration));
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
 
         window_changed = window_changed || update_window_and_padding(win, output_access);
 
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
new file mode 100644
index 0000000..ee3fa11
--- /dev/null
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+namespace arm_compute
+{
+CLUpsampleLayerKernel::CLUpsampleLayerKernel()
+    : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+{
+}
+
+Status CLUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(upsampling_policy);
+
+    DataLayout data_layout = input->data_layout();
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+
+    return Status{};
+}
+
+void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(upsampling_policy);
+
+    _input                                     = input;
+    _output                                    = output;
+    _info                                      = info;
+    _num_elems_processed_per_iteration_input_x = 1;
+
+    const DataLayout data_layout = input->info()->data_layout();
+
+    TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    output->info()->set_data_layout(data_layout);
+
+    unsigned int num_elems_processed_per_iteration_x = 16;
+    const int    output_width_x                      = output->info()->dimension(0);
+    const bool   multi_access_x                      = ((output_width_x / num_elems_processed_per_iteration_x) > 0);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(CLUpsampleLayerKernel::validate(input->info(), output->info(), info, upsampling_policy));
+
+    Window win{};
+
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            win = calculate_max_window(*output->info());
+            win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.y()));
+            if(multi_access_x)
+            {
+                _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x / info.x();
+                win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), num_elems_processed_per_iteration_x), num_elems_processed_per_iteration_x));
+            }
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            win = calculate_max_window(*output->info());
+            win.set(Window::DimY, Window::Dimension(win.y().start(), win.y().end(), info.x()));
+            win.set(Window::DimZ, Window::Dimension(win.z().start(), win.z().end(), info.y()));
+            if(multi_access_x)
+            {
+                _num_elems_processed_per_iteration_input_x = num_elems_processed_per_iteration_x;
+                win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(),
+                                                                                          num_elems_processed_per_iteration_x),
+                                                        num_elems_processed_per_iteration_x));
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_IN=" + support::cpp11::to_string(_num_elems_processed_per_iteration_input_x));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_IN=" + support::cpp11::to_string(std::max<int>(_input->info()->dimension(0) - _num_elems_processed_per_iteration_input_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_OUT=" + support::cpp11::to_string(std::max<int>(output_width_x - num_elems_processed_per_iteration_x, 0)));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("upsample_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
+
+    ICLKernel::configure_internal(win);
+}
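+
+// A minimal usage sketch (illustrative only): nearest-neighbour upsampling by a factor of 2
+// in both spatial dimensions, which is the only configuration accepted by validate() above.
+//
+//     CLTensor src, dst; // e.g. src of shape [W, H, C] in NCHW
+//     CLUpsampleLayerKernel upsample;
+//     upsample.configure(&src, &dst, Size2D(2, 2), InterpolationPolicy::NEAREST_NEIGHBOR);
+//     // dst is auto-initialized to [2 * W, 2 * H, C]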
+
+void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed_window = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice_out        = collapsed_window.first_slice_window_3D();
+    Window slice_in         = collapsed_window.first_slice_window_3D();
+
+    DataLayout data_layout = _input->info()->data_layout();
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+            slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
+            slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+            break;
+        case DataLayout::NHWC:
+            slice_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
+            slice_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_3D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice_out);
+    }
+    while(collapsed_window.slide_window_slice_3D(slice_out) && collapsed_window.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000..b0d27cb
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+    // The window needs to be based on the output
+    Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input2->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) > output->dimension(0));
+
+    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x2_";
+    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+}
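+
+// A minimal usage sketch (illustrative only): concatenating two tensors along the width (X)
+// dimension. All dimensions other than X must match, and the output width must be at least
+// the sum of the input widths, as checked by validate_arguments above.
+//
+//     CLTensor in0, in1, out; // e.g. in0 = [W0, H, C], in1 = [W1, H, C], out = [W0 + W1, H, C]
+//     CLWidthConcatenate2TensorsKernel concat;
+//     concat.configure(&in0, &in1, &out);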
+
+void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input1, slice);
+        add_4D_tensor_argument(idx, _input2, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, window, lws_hint());
+    }
+    while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000..75aef9c
--- /dev/null
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output)
+{
+    // The window needs to be based on the output
+    Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration, input1->dimension(1));
+    AccessWindowStatic input2_access(input2, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input2->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input2->dimension(1));
+    AccessWindowStatic input3_access(input3, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input3->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input3->dimension(1));
+    AccessWindowStatic input4_access(input4, -num_elems_processed_per_iteration, 0, ceil_to_multiple(input4->dimension(0), num_elems_processed_per_iteration) + num_elems_processed_per_iteration,
+                                     input4->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input1_access, input2_access, input3_access, input4_access, output_access);
+
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win_collapsed);
+}
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::F16, DataType::U32,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, input3, input4, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) + input2->dimension(0) + input3->dimension(0) + input4->dimension(0) > output->dimension(0));
+
+    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input2->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input3->dimension(i) != output->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(input4->dimension(i) != output->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
+    : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
+{
+}
+
+Status CLWidthConcatenate4TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, input3, input4, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), input3->clone().get(), input4->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
+
+    _input1 = input1;
+    _input2 = input2;
+    _input3 = input3;
+    _input4 = input4;
+    _output = output;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
+    build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
+    build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x4_";
+    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input3->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input3->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input4->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input4->info()->dimension(1));
+}
+
+void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, _input1, slice);
+        add_4D_tensor_argument(idx, _input2, slice);
+        add_4D_tensor_argument(idx, _input3, slice);
+        add_4D_tensor_argument(idx, _input4, slice);
+        add_4D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, window, lws_hint());
+    }
+    while(window.slide_window_slice_4D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index e5ab8d2..c51c579 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -53,8 +53,10 @@
     AccessWindowHorizontal output_access(output, width_offset, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, output_access);
 
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, win_collapsed);
 }
 Status validate_arguments(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output)
 {
@@ -69,7 +71,7 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
     }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
     return Status{};
 }
@@ -103,6 +105,7 @@
     build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options()));
@@ -119,14 +122,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice = window.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
-    }
-    while(window.slide_window_slice_3D(slice));
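+    // The window returned by validate_and_configure_window() is already collapsed, so the kernel
+    // can be enqueued once over the full 4D window instead of iterating over 3D slices.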
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, window);
+    add_4D_tensor_argument(idx, _output, window);
+    enqueue(queue, *this, window);
 }
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 818638c..55cc465 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -45,7 +46,8 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
 
     const Size2D kernel_size      = winograd_info.kernel_size;
     const Size2D output_tile_size = winograd_info.output_tile_size;
@@ -109,9 +111,9 @@
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
-
     const Size2D kernel_size      = winograd_info.kernel_size;
     const Size2D output_tile_size = winograd_info.output_tile_size;
 
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index c4e472a..1c31ceb 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
@@ -41,7 +42,8 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
 
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
@@ -114,6 +116,7 @@
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const Size2D        kernel_size      = winograd_info.kernel_size;
+    const DataLayout    data_layout      = input->info()->data_layout();
 
     const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -122,7 +125,7 @@
     const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
     const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
 
-    if(input->info()->data_layout() == DataLayout::NCHW)
+    if(data_layout == DataLayout::NCHW)
     {
         // Check if we need to extend the right or bottom border
         const unsigned int extra_border_right  = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -152,6 +155,7 @@
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(output->info()->dimension(1)));
+    const size_t total_batches = input->info()->tensor_shape().total_size_upper(3);
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
@@ -159,14 +163,19 @@
     build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
     build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
     build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
-
-    if(input->info()->data_layout() == DataLayout::NHWC)
+    if(data_layout == DataLayout::NHWC)
     {
+        build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
         build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
         build_opts.add_option("-DSRC_DIM_2=" + support::cpp11::to_string(_input->info()->dimension(2)));
     }
+    else
+    {
+        build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
+    }
 
     // Create kernel
     std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
@@ -175,7 +184,7 @@
     const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
 
     // Check optimized kernel if output_dims == 2x2
-    if((tile_max_dim == 2) && (input->info()->data_layout() == DataLayout::NCHW))
+    if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
     {
         _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
     }
@@ -183,7 +192,7 @@
     // Append stepz and data layout
     kernel_name += "_stepz";
     kernel_name += support::cpp11::to_string(_step_z);
-    kernel_name += "_" + lower_string(string_from_data_layout(input->info()->data_layout()));
+    kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
@@ -220,17 +229,30 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const size_t idx_w = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    const DataLayout data_layout   = _input->info()->data_layout();
+    const size_t     idx_w         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t     idx_c         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const size_t     total_batches = window.shape().total_size_upper(3);
 
-    Window slice = window.first_slice_window_3D();
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    Window slice = window_collapsed.first_slice_window_3D();
     slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
     slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
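+    // In NHWC the batches are folded into the same window dimension as the tiles along y,
+    // so that dimension has to span _num_tiles_y * total_batches.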
+    if(data_layout == DataLayout::NHWC)
+    {
+        slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
+    }
 
     ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
     slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
 
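+    // Pass the batch strides (dimension 3, in bytes) of input and output as extra kernel
+    // arguments; they are bound once, after the two 3D tensor arguments set inside the loop below.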
+    unsigned int idx = 2 * num_arguments_per_3D_tensor();
+    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+
     do
     {
         unsigned int idx = 0;
@@ -239,5 +261,5 @@
 
         enqueue(queue, *this, slice, lws_hint());
     }
-    while(window.slide_window_slice_3D(slice));
+    while(window_collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index fa42596..7f1afe0 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -47,7 +48,8 @@
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
 
     ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
 
@@ -155,6 +157,7 @@
                                                                 kernel_size,
                                                                 output_tile_size,
                                                                 conv_info);
+    const size_t total_batches = output->info()->tensor_shape().total_size_upper(3);
 
     // Set build options
     CLBuildOptions build_opts;
@@ -162,6 +165,8 @@
     build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
     build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
     build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(_input->info()->dimension(2)));
     build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
     build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
 
@@ -203,8 +208,11 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
     // Get initial windows
-    Window slice = window.first_slice_window_3D();
+    Window slice = window_collapsed.first_slice_window_4D();
     slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
 
     // Setup output slice
@@ -214,7 +222,7 @@
 
     if(_bias != nullptr)
     {
-        unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+        unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
         Window       slice_biases;
         slice_biases.use_tensor_dimensions(_bias->info()->tensor_shape());
         add_1D_tensor_argument(idx1, _bias, slice_biases);
@@ -222,15 +230,15 @@
 
     if(_output->info()->data_layout() == DataLayout::NHWC)
     {
-        unsigned int idx2 = 2 * num_arguments_per_3D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
+        unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((_bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
         _kernel.setArg(idx2, static_cast<int>(_output->info()->total_size() - _output->info()->strides_in_bytes().y()));
     }
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice_out);
+        add_4D_tensor_argument(idx, _input, slice);
+        add_4D_tensor_argument(idx, _output, slice_out);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
new file mode 100644
index 0000000..7d9dbd4
--- /dev/null
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
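+    // Each box prediction holds (num_classes + 5) channels: 4 box coordinates, 1 objectness score
+    // and num_classes class scores, so the channel dimension must be a multiple of (num_classes + 5).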
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % (num_classes + 5)) != 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    if(output != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input);
+    }
+
+    const bool         is_nchw                           = input->data_layout() == DataLayout::NCHW;
+    const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->element_size() : 1;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLYOLOLayerKernel::CLYOLOLayerKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLYOLOLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _run_in_place = (output == nullptr) || (output == input);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info, num_classes));
+
+    const bool         is_nchw                           = input->info()->data_layout() == DataLayout::NCHW;
+    const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->info()->element_size() : 1;
+    const DataType     dt                                = input->info()->data_type();
+    float              a_const                           = act_info.a();
+    float              b_const                           = act_info.b();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(act_info.activation())));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option("-DSELECT_DATA_TYPE=" + get_cl_select_type_from_data_type(dt));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(a_const));
+    build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(b_const));
+    build_opts.add_option("-DNUM_CLASSES=" + support::cpp11::to_string(num_classes));
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+
+    // Create kernel
+    std::string kernel_name = std::string("yolo_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Make sure _kernel is initialized before calling the parent's configure
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "yolo_layer_";
+    _config_id += lower_string(string_from_data_type(dt));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+}
+
+Status CLYOLOLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info, num_classes));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get()).first);
+
+    return Status{};
+}
+
+void CLYOLOLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
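+    // When running in place only the input tensor is bound; the kernel writes the activations back to it.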
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        if(!_run_in_place)
+        {
+            add_3D_tensor_argument(idx, _output, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
index 9d18a9c..01fb016 100644
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,26 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
+                                                        bool border_undefined, const arm_compute::BorderSize &border_size)
+{
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->valid_region(), border_undefined, border_size);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 ICPPSimpleKernel::ICPPSimpleKernel()
     : _input{ nullptr }, _output{ nullptr }
@@ -40,14 +59,16 @@
     _output = output;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
-
-    ICPPKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, border_undefined, border_size);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
 }
+
+Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
+                                  bool border_undefined, const arm_compute::BorderSize &border_size)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, border_undefined, border_size).first);
+    return Status{};
+}
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
new file mode 100644
index 0000000..06a0551
--- /dev/null
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -0,0 +1,415 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+{
+    std::vector<int> keep;
+    const int        proposals_width = proposals->info()->dimension(1);
+
+    std::vector<T> x1(proposals_width);
+    std::vector<T> y1(proposals_width);
+    std::vector<T> x2(proposals_width);
+    std::vector<T> y2(proposals_width);
+    std::vector<T> areas(proposals_width);
+
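+    // Boxes for this class are stored as groups of four values (x1, y1, x2, y2) along dimension 0;
+    // extract them and pre-compute the box areas (the +1 accounts for inclusive pixel coordinates).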
+    for(int i = 0; i < proposals_width; ++i)
+    {
+        x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+        y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+        x2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+        y2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+        areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+    }
+
+    // Note: Soft NMS scores have already been initialized with input scores
+
+    while(!inds.empty())
+    {
+        // Find proposal with max score among remaining proposals
+        int max_pos = 0;
+        for(unsigned int i = 1; i < inds.size(); ++i)
+        {
+            if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+            {
+                max_pos = i;
+            }
+        }
+        int element = inds.at(max_pos);
+        keep.push_back(element);
+        std::swap(inds.at(0), inds.at(max_pos));
+
+        // Remove the highest-scoring element (now at the front) and compute the IoU of the remaining boxes with it
+        inds.erase(inds.begin());
+
+        std::vector<int> sorted_indices_temp;
+        for(auto idx : inds)
+        {
+            const auto xx1 = std::max(x1[idx], x1[element]);
+            const auto yy1 = std::max(y1[idx], y1[element]);
+            const auto xx2 = std::min(x2[idx], x2[element]);
+            const auto yy2 = std::min(y2[idx], y2[element]);
+
+            const auto w     = std::max((xx2 - xx1 + 1.f), 0.f);
+            const auto h     = std::max((yy2 - yy1 + 1.f), 0.f);
+            const auto inter = w * h;
+            const auto ovr   = inter / (areas[element] + areas[idx] - inter);
+
+            // Update scores based on computed IoU, overlap threshold and NMS method
+            T weight;
+            switch(info.soft_nms_method())
+            {
+                case NMSType::LINEAR:
+                    weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
+                    break;
+                case NMSType::GAUSSIAN: // Gaussian
+                    weight = std::exp(-1.f * ovr * ovr / info.soft_nms_sigma());
+                    break;
+                case NMSType::ORIGINAL: // Original NMS
+                    weight = (ovr > info.nms()) ? 0.f : 1.f;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+
+            // Discard boxes with new scores below min threshold and update pending indices
+            scores_in[class_id][idx] *= weight;
+            if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+            {
+                sorted_indices_temp.push_back(idx);
+            }
+        }
+        inds = sorted_indices_temp;
+    }
+
+    return keep;
+}
+
+template <typename T>
+std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+{
+    std::vector<int> keep;
+
+    const int proposals_width = proposals->info()->dimension(1);
+
+    std::vector<T> x1(proposals_width);
+    std::vector<T> y1(proposals_width);
+    std::vector<T> x2(proposals_width);
+    std::vector<T> y2(proposals_width);
+    std::vector<T> areas(proposals_width);
+
+    for(int i = 0; i < proposals_width; ++i)
+    {
+        x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+        y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+        x2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+        y2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+        areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+    }
+
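+    // Greedy NMS: the indices arrive sorted by decreasing score, so repeatedly keep the first
+    // remaining box and discard every later box whose IoU with it exceeds the NMS threshold.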
+    while(!sorted_indices.empty())
+    {
+        int i = sorted_indices.at(0);
+        keep.push_back(i);
+
+        std::vector<int> sorted_indices_temp = sorted_indices;
+        std::vector<int> new_indices;
+        sorted_indices_temp.erase(sorted_indices_temp.begin());
+
+        for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+        {
+            const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
+            const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
+            const float xx2 = std::min(x2[sorted_indices_temp.at(j)], x2[i]);
+            const float yy2 = std::min(y2[sorted_indices_temp.at(j)], y2[i]);
+
+            const float w     = std::max((xx2 - xx1 + 1.f), 0.f);
+            const float h     = std::max((yy2 - yy1 + 1.f), 0.f);
+            const float inter = w * h;
+            const float ovr   = inter / (areas[i] + areas[sorted_indices_temp.at(j)] - inter);
+            const float ctr_x = xx1 + (w / 2);
+            const float ctr_y = yy1 + (h / 2);
+
+            // If suppress_size is specified, filter the boxes based on their size and position
+            const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
+            if(ovr <= info.nms() && keep_size)
+            {
+                new_indices.push_back(j);
+            }
+        }
+
+        const unsigned int new_indices_size = new_indices.size();
+        std::vector<int>   new_sorted_indices(new_indices_size);
+        for(unsigned int i = 0; i < new_indices_size; ++i)
+        {
+            new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
+        }
+        sorted_indices = new_sorted_indices;
+    }
+
+    return keep;
+}
+} // namespace
+
+CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
+    : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+      _info()
+{
+}
+
+bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
+{
+    return false;
+}
+
+template <typename T>
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
+{
+    const int                     batch_size   = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+    const int                     num_classes  = _scores_in->info()->dimension(0);
+    const int                     scores_count = _scores_in->info()->dimension(1);
+    std::vector<int>              total_keep_per_batch(batch_size);
+    std::vector<std::vector<int>> keeps(num_classes);
+    int                           total_keep_count = 0;
+
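+    // Copy the input scores into per-class vectors; SoftNMS updates these scores in place while
+    // boxes are being suppressed.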
+    std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
+    for(int i = 0; i < scores_count; ++i)
+    {
+        for(int j = 0; j < num_classes; ++j)
+        {
+            in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
+        }
+    }
+
+    int offset        = 0;
+    int cur_start_idx = 0;
+    for(int b = 0; b < batch_size; ++b)
+    {
+        const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
+        // Skip the first class unless it is the only one.
+        const int j_start = (num_classes == 1 ? 0 : 1);
+        for(int j = j_start; j < num_classes; ++j)
+        {
+            std::vector<T>   cur_scores(scores_count);
+            std::vector<int> inds;
+            for(int i = 0; i < scores_count; ++i)
+            {
+                const T score = in_scores[j][i];
+                cur_scores[i] = score;
+
+                if(score > _info.score_thresh())
+                {
+                    inds.push_back(i);
+                }
+            }
+            if(_info.soft_nms_enabled())
+            {
+                keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
+            }
+            else
+            {
+                std::sort(inds.data(), inds.data() + inds.size(),
+                          [&cur_scores](int lhs, int rhs)
+                {
+                    return cur_scores[lhs] > cur_scores[rhs];
+                });
+
+                keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
+            }
+            total_keep_count += keeps[j].size();
+        }
+
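+        // Limit the total number of detections per image to detections_per_im() by keeping only
+        // the highest-scoring boxes across all classes.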
+        if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+        {
+            // Merge the scores of all kept detections and sort them
+            auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
+            {
+                std::vector<T> ret(total_keep_count);
+
+                int ret_idx = 0;
+                for(unsigned int i = 1; i < keeps.size(); ++i)
+                {
+                    auto &cur_keep = keeps[i];
+                    for(auto &ckv : cur_keep)
+                    {
+                        ret[ret_idx++] = in_scores[i][ckv];
+                    }
+                }
+
+                std::sort(ret.data(), ret.data() + ret.size());
+
+                return ret;
+            };
+
+            auto    all_scores_sorted = get_all_scores_sorted();
+            const T image_thresh      = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
+            for(int j = 1; j < num_classes; ++j)
+            {
+                auto            &cur_keep = keeps[j];
+                std::vector<int> new_keeps_j;
+                for(auto &k : cur_keep)
+                {
+                    if(in_scores[j][k] >= image_thresh)
+                    {
+                        new_keeps_j.push_back(k);
+                    }
+                }
+                keeps[j] = new_keeps_j;
+            }
+            total_keep_count = _info.detections_per_im();
+        }
+
+        total_keep_per_batch[b] = total_keep_count;
+
+        // Write results
+        int cur_out_idx = 0;
+        for(int j = j_start; j < num_classes; ++j)
+        {
+            auto     &cur_keep        = keeps[j];
+            auto      cur_out_scores  = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            auto      cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            const int box_column      = (cur_start_idx + cur_out_idx) * 4;
+
+            for(unsigned int k = 0; k < cur_keep.size(); ++k)
+            {
+                cur_out_scores[k]     = in_scores[j][cur_keep[k]];
+                cur_out_classes[k]    = static_cast<T>(j);
+                auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+                auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+                auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+                auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+                *cur_out_box_row0     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+                *cur_out_box_row1     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+                *cur_out_box_row2     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+                *cur_out_box_row3     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+            }
+
+            cur_out_idx += cur_keep.size();
+        }
+
+        if(_keeps != nullptr)
+        {
+            cur_out_idx = 0;
+            for(int j = 0; j < num_classes; ++j)
+            {
+                for(unsigned int i = 0; i < keeps[j].size(); ++i)
+                {
+                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+                }
+                *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+                cur_out_idx += keeps[j].size();
+            }
+        }
+
+        offset += num_boxes;
+        cur_start_idx += total_keep_count;
+    }
+
+    if(_batch_splits_out != nullptr)
+    {
+        for(int b = 0; b < batch_size; ++b)
+        {
+            *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
+        }
+    }
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                                                          ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
+    const unsigned int num_classes = scores_in->info()->dimension(0);
+
+    ARM_COMPUTE_UNUSED(num_classes);
+    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
+    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+
+    ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
+    if(keeps != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
+        ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
+        ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
+    }
+    if(batch_splits_in != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
+    }
+    if(batch_splits_out != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
+    }
+
+    _scores_in        = scores_in;
+    _boxes_in         = boxes_in;
+    _batch_splits_in  = batch_splits_in;
+    _scores_out       = scores_out;
+    _boxes_out        = boxes_out;
+    _classes          = classes;
+    _batch_splits_out = batch_splits_out;
+    _keeps            = keeps;
+    _keeps_size       = keeps_size;
+    _info             = info;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*scores_in->info(), Steps(scores_in->info()->dimension(0)));
+
+    IKernel::configure(win);
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+
+    switch(_scores_in->info()->data_type())
+    {
+        case DataType::F32:
+            run_nmslimit<float>();
+            break;
+        case DataType::F16:
+            run_nmslimit<half>();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+} // namespace arm_compute
diff --git a/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
new file mode 100644
index 0000000..2d4c0ce
--- /dev/null
+++ b/src/core/CPP/kernels/CPPFlipWeightsKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+CPPFlipWeightsKernel::CPPFlipWeightsKernel()
+    : _input(nullptr), _output(nullptr), _func(nullptr)
+{
+}
+
+template <typename T>
+void CPPFlipWeightsKernel::flip_weights(const Window &window_input)
+{
+    // Create iterators
+    Iterator in(_input, window_input);
+
+    const DataLayout data_layout = _input->info()->data_layout();
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    const int kernel_width  = _input->info()->dimension(idx_w);
+    const int kernel_height = _input->info()->dimension(idx_h);
+
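+    // Mirror every element in both the width and height dimensions, i.e. rotate each 2D kernel
+    // plane by 180 degrees.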
+    execute_window_loop(window_input, [&](const Coordinates & id)
+    {
+        const unsigned int x = kernel_width - id[idx_w] - 1;
+        const unsigned int y = kernel_height - id[idx_h] - 1;
+        Coordinates        output_coord(id);
+        output_coord.set(idx_w, x);
+        output_coord.set(idx_h, y);
+        *(reinterpret_cast<T *>(_output->ptr_to_element(output_coord))) = *(reinterpret_cast<const T *>(in.ptr()));
+    },
+    in);
+}
+
+void CPPFlipWeightsKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The CPPFlipWeightsKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    ICPPKernel::configure(win);
+
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = &CPPFlipWeightsKernel::flip_weights<float>;
+            break;
+        case DataType::F16:
+            _func = &CPPFlipWeightsKernel::flip_weights<half>;
+            break;
+        case DataType::QASYMM8:
+            _func = &CPPFlipWeightsKernel::flip_weights<uint8_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+
+void CPPFlipWeightsKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (this->*_func)(window);
+}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 5e7609c..40b5a2b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -675,6 +675,7 @@
     {
         tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
 
+        // FIXME: need odd/even detection for tmp_out_offset?
         mediump vec2 bias_vec = vec2(1.0f, 1.0f);
         STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
     }
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index a65f980..e51908b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -169,6 +169,7 @@
     vec4 c20 = vec4(0.0f);
     vec4 c30 = vec4(0.0f);
 
+    // FIXME: loop unrolling really needed for GLES?
     for(; int(CURRENT_ITEM_OFFSET(src1_iter)) <= (end_row_mtx_b - 8); TENSOR_ITERATOR_ADVANCE(src0_iter, 8), TENSOR_ITERATOR_ADVANCE(src1_iter, 8))
     {
         /* Load values from matrix A (interleaved) and matrix B (transposed) */
@@ -1061,6 +1062,7 @@
     c30[0] = vec4(0.0f);
     c30[1] = vec4(0.0f);
 
+    // FIXME: loop unrolling really needed for GLES?
     for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
     {
         /* Load values from matrix A (interleaved) and matrix B (transposed) */
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 8b0d41f..ecff233 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -64,6 +64,8 @@
     if(bias != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+        // FIXME: Bug in framework, workaround it in tests currently.
+        //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
         ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
     }
 
@@ -130,6 +132,7 @@
             switch(input->info()->data_type())
             {
                 case DataType::F16:
+                    // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
 #define PROCESS_4X_3Y_1Z
 
 #if defined(PROCESS_8X_3Y_1Z)
@@ -177,6 +180,7 @@
                     break;
             }
         }
+        // FIXME: Just keep one in release
         else
         {
             switch(input->info()->data_type())
@@ -188,6 +192,7 @@
                     break;
 
                 case DataType::F32:
+                    // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
 #define PROCESS_4X_1Y_1Z
 
 #if defined(PROCESS_1X_1Y_1Z)
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index fac2902..03463b2 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -36,26 +36,75 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    const unsigned int num_elems_processed_per_iteration = 4;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
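+    // mean and std are 1D vectors; pad their access windows up to a multiple of the processing
+    // vector size so the shader can always load full vectors.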
+    const int              mean_padding = ceil_to_multiple(mean->dimension(0), num_elems_processed_per_iteration) - mean->dimension(0);
+    const int              std_padding  = ceil_to_multiple(std->dimension(0), num_elems_processed_per_iteration) - std->dimension(0);
+    AccessWindowStatic     mean_access(mean, 0, 0, mean->dimension(0) + mean_padding, mean->dimension(1));
+    AccessWindowStatic     std_access(std, 0, 0, std->dimension(0) + std_padding, std->dimension(1));
+
+    const bool window_changed = update_window_and_padding(win, input_access, output_access, mean_access, std_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 GCNormalizePlanarYUVLayerKernel::GCNormalizePlanarYUVLayerKernel()
-    : _input(nullptr), _output(nullptr), _mean(nullptr), _sd(nullptr)
+    : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
 {
 }
 
-void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, sd);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, sd);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
 
     _input  = input;
     _output = output;
     _mean   = mean;
-    _sd     = sd;
-
-    const unsigned int num_elems_processed_per_iteration = 4;
+    _std    = std;
 
     // Set build options
     std::set<std::string> build_opts;
@@ -67,19 +116,17 @@
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalize_planar_yuv_layer", build_opts));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    const int              mean_padding = ceil_to_multiple(mean->info()->dimension(0), num_elems_processed_per_iteration) - mean->info()->dimension(0);
-    const int              sd_padding   = ceil_to_multiple(sd->info()->dimension(0), num_elems_processed_per_iteration) - sd->info()->dimension(0);
-    AccessWindowStatic     mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + mean_padding, mean->info()->dimension(1));
-    AccessWindowStatic     sd_access(sd->info(), 0, 0, sd->info()->dimension(0) + sd_padding, sd->info()->dimension(1));
+    IGCKernel::configure(std::get<1>(win_config));
+}
 
-    update_window_and_padding(win, input_access, output_access, mean_access, sd_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    IGCKernel::configure(win);
+Status GCNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get())));
+    return Status{};
 }
 
 void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
@@ -100,7 +147,7 @@
 
     unsigned int idx = 2 * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, 3, slice_in);
-    add_1D_tensor_argument(idx, _sd, 4, slice_in);
+    add_1D_tensor_argument(idx, _std, 4, slice_in);
 
     slice_in = window.first_slice_window_3D();
 
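The rework above follows the library-wide pattern of splitting checks out of configure(): validate_arguments() performs the static checks, validate_and_configure_window() derives the execution window, and the new static validate() replays both on cloned TensorInfo objects so callers can query support without allocating or mutating tensors. The following is a minimal standalone C++ sketch of that pattern; Status, TensorMeta, WindowSketch and KernelSketch are hypothetical stand-ins, not arm_compute classes.

// Minimal standalone sketch (hypothetical types, not the arm_compute API) of the
// validate-before-configure pattern: a static validate() runs the same checks as
// configure(), but on metadata clones, so support can be queried without touching
// real tensor memory.
#include <memory>
#include <string>
#include <utility>

struct Status
{
    Status() = default;
    Status(bool ok_, std::string msg_) : ok(ok_), msg(std::move(msg_)) {}
    bool        ok{ true };
    std::string msg{};
};

struct TensorMeta
{
    int dims[4]{ 1, 1, 1, 1 };

    std::unique_ptr<TensorMeta> clone() const
    {
        return std::make_unique<TensorMeta>(*this);
    }
};

struct WindowSketch
{
    int step{ 4 };
};

static Status validate_arguments(const TensorMeta *in, const TensorMeta *out)
{
    if(in == nullptr || out == nullptr)
    {
        return Status(false, "null tensor info");
    }
    return Status{};
}

static std::pair<Status, WindowSketch> validate_and_configure_window(TensorMeta *in, TensorMeta *out)
{
    // Window/padding bookkeeping would happen here; failures are reported via Status.
    (void)in;
    (void)out;
    return std::make_pair(Status{}, WindowSketch{});
}

struct KernelSketch
{
    // configure() asserts on error and keeps the computed window for run().
    void configure(TensorMeta *in, TensorMeta *out)
    {
        _window = validate_and_configure_window(in, out).second;
    }

    // validate() works on clones so the caller's metadata is never mutated.
    static Status validate(const TensorMeta *in, const TensorMeta *out)
    {
        const Status s = validate_arguments(in, out);
        if(!s.ok)
        {
            return s;
        }
        return validate_and_configure_window(in->clone().get(), out->clone().get()).first;
    }

    WindowSketch _window{};
};
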
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index a14a9c9..78e2df1 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -51,6 +51,14 @@
     {
         return arm_compute::GPUTarget::G51LIT;
     }
+    else if(version == "G52")
+    {
+        return arm_compute::GPUTarget::G52;
+    }
+    else if(version == "G52LIT")
+    {
+        return arm_compute::GPUTarget::G52LIT;
+    }
     else if(version == "G76")
     {
         return arm_compute::GPUTarget::G76;
@@ -106,6 +114,8 @@
         { GPUTarget::G51, "g51" },
         { GPUTarget::G51BIG, "g51big" },
         { GPUTarget::G51LIT, "g51lit" },
+        { GPUTarget::G52, "g52" },
+        { GPUTarget::G52LIT, "g52lit" },
         { GPUTarget::G76, "g76" },
         { GPUTarget::TTRX, "ttrx" },
         { GPUTarget::TBOX, "tbox" }
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index c73f4e7..be65102 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -102,6 +102,16 @@
         return false;
     }
 
+    PaddingSize needed = get_needed_padding(window);
+    PaddingSize available = _info->padding();
+
+    if(needed.top <= available.top && needed.right <= available.right
+       && needed.bottom <= available.bottom
+       && needed.left <= available.left)
+    {
+        return false;
+    }
+
     const TensorShape &shape                = _info->tensor_shape();
     const Strides     &strides              = _info->strides_in_bytes();
     const size_t       offset_first_element = _info->offset_first_element_in_bytes();
@@ -206,7 +216,12 @@
     {
         return false;
     }
+    // Update strides in tensor info
+    return _info->extend_padding(get_needed_padding(window));
+}
 
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const
+{
     ARM_COMPUTE_ERROR_ON(_scale_x == 0);
     ARM_COMPUTE_ERROR_ON(_scale_y == 0);
 
@@ -223,6 +238,5 @@
     padding.top    = std::max(0, -min_y);
     padding.bottom = std::max<int>(0, max_y - shape[1]);
 
-    // Update strides in tensor info
-    return _info->extend_padding(padding);
+    return padding;
 }
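The IAccessWindow change above factors the padding computation into get_needed_padding() and adds an early return when the tensor's existing padding already covers what the window requires, so extend_padding() is only called when something actually changes. Below is a minimal standalone sketch of that early-out, using hypothetical Padding/TensorPadState types rather than the real ITensorInfo/AccessWindowRectangle classes.

// Minimal standalone sketch (hypothetical types, not the arm_compute classes) of the
// early-out added above: compute the padding a window needs and only extend the
// tensor's padding when it is not already covered.
#include <algorithm>

struct Padding
{
    int left{ 0 }, right{ 0 }, top{ 0 }, bottom{ 0 };
};

struct TensorPadState
{
    Padding padding{};

    // Mirrors extend_padding(): grow each side to at least the requested value.
    bool extend(const Padding &p)
    {
        const bool changed = p.left > padding.left || p.right > padding.right
                             || p.top > padding.top || p.bottom > padding.bottom;
        padding.left   = std::max(padding.left, p.left);
        padding.right  = std::max(padding.right, p.right);
        padding.top    = std::max(padding.top, p.top);
        padding.bottom = std::max(padding.bottom, p.bottom);
        return changed;
    }
};

// Padding required so that accesses in [min_x, max_x) x [min_y, max_y) stay inside
// a width x height tensor.
static Padding needed_padding(int min_x, int max_x, int min_y, int max_y, int width, int height)
{
    Padding p;
    p.left   = std::max(0, -min_x);
    p.right  = std::max(0, max_x - width);
    p.top    = std::max(0, -min_y);
    p.bottom = std::max(0, max_y - height);
    return p;
}

static bool update_padding_if_needed(TensorPadState &t, int min_x, int max_x, int min_y, int max_y, int width, int height)
{
    const Padding needed    = needed_padding(min_x, max_x, min_y, max_y, width, height);
    const Padding available = t.padding;

    // Early out: nothing to do when the existing padding already covers the accesses.
    if(needed.top <= available.top && needed.right <= available.right
       && needed.bottom <= available.bottom && needed.left <= available.left)
    {
        return false;
    }
    return t.extend(needed);
}
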
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 7a92c6b..5ce79f1 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -184,7 +184,7 @@
     Iterator output(_output, window);
 
     static const float16x8_t CONST_0   = vdupq_n_f16(0.f);
-    static const float16x4_t CONST_1_H = vdup_n_f16(1.f);
+    static const float16x8_t CONST_1_H = vdupq_n_f16(1.f);
 
     static const float32x4_t CONST_1_F32 = vdupq_n_f32(1.f);
 
@@ -240,23 +240,11 @@
                 break;
             case ActivationFunction::LOGISTIC:
             {
-                const float16x4x2_t in0 =
-                {
-                    vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[0]))))))),
-                    vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[0]))))))),
-                };
-
-                const float16x4x2_t in1 =
-                {
-                    vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_low_f16(in.val[1]))))))),
-                    vinv_f16(vadd_f16(CONST_1_H, vcvt_f16_f32(vexpq_f32(vcvt_f32_f16(vneg_f16(vget_high_f16(in.val[1]))))))),
-                };
-
                 tmp =
                 {
                     {
-                        vcombine_f16(in0.val[0], in0.val[1]),
-                        vcombine_f16(in1.val[0], in1.val[1]),
+                        vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[0])))),
+                        vinvq_f16(vaddq_f16(CONST_1_H, vexpq_f16(vnegq_f16(in.val[1]))))
                     }
                 };
             }
@@ -281,6 +269,7 @@
                 break;
             case ActivationFunction::SOFT_RELU:
             {
+                // TODO (COMPMID-1535) : Revisit FP16 approximations
                 const float16x4x2_t in0 =
                 {
                     vcvt_f16_f32(vlogq_f32(vaddq_f32(CONST_1_F32, vexpq_f32(vcvt_f32_f16(vget_low_f16(in.val[0])))))),
@@ -322,6 +311,7 @@
                 break;
             case ActivationFunction::TANH:
             {
+                // TODO (COMPMID-1535) : Revisit FP16 approximations
                 const float16x8x2_t mul =
                 {
                     vmulq_f16(b, in.val[0]),
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a6102b1..169554f 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -456,7 +456,7 @@
 
 Status NEArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 3c76548..ff8fb84 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -46,10 +46,12 @@
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -64,8 +66,8 @@
 
 void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -80,8 +82,8 @@
 
 void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -104,8 +106,8 @@
 
 void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -144,8 +146,8 @@
 void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -167,8 +169,8 @@
 
 void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -192,8 +194,8 @@
 }
 void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -213,8 +215,8 @@
 
 void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -234,8 +236,8 @@
 
 void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -255,8 +257,8 @@
 
 void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -276,8 +278,8 @@
 
 void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -298,8 +300,8 @@
 
 void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
+    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
+    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
     execute_window_loop(window, [&](const Coordinates & id)
@@ -318,43 +320,71 @@
     input1, input2, output);
 }
 
-inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
-        && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
-        && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
-        "You called subtract with the wrong image formats");
+    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
+    // Validate in case of configured output
+    if(output.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
+            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
+            "You called subtract with the wrong image formats");
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
+                                        "Wrong shape for output");
+    }
     return Status{};
 }
 
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    // Auto initialize output if not initialized
+    {
+        set_shape_if_empty(output, out_shape);
 
-    bool window_changed = update_window_and_padding(win,
-                                                    AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration),
-                                                    AccessWindowHorizontal(input2, 0, num_elems_processed_per_iteration),
-                                                    output_access);
+        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
+        {
+            set_format_if_unknown(output, Format::S16);
+        }
+        else if(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16)
+        {
+            set_format_if_unknown(output, Format::F16);
+        }
+        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
+        {
+            set_format_if_unknown(output, Format::F32);
+        }
+    }
 
-    ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
-                                                       input2->valid_region());
+    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
+    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
+    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+
+    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win_input1, input1_access)
+                          || update_window_and_padding(win_input2, input2_access)
+                          || update_window_and_padding(win, output_access);
 
     output_access.set_valid_region(win, valid_region);
 
@@ -371,26 +401,11 @@
 void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
 
-    // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-        if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output->info(), Format::S16);
-        }
-        else if(input1->info()->data_type() == DataType::F16 || input2->info()->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output->info(), Format::F16);
-        }
-        else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output->info(), Format::F32);
-        }
-    }
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
     {
@@ -427,16 +442,15 @@
         _func = it->second;
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
 Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -450,3 +464,10 @@
 
     (*_func)(_input1, _input2, _output, window);
 }
+
+BorderSize NEArithmeticSubtractionKernel::border_size() const
+{
+    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize(0, border, 0, 0);
+}
\ No newline at end of file
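The subtraction kernel changes above add broadcasting: validate_arguments() now derives the output shape with TensorShape::broadcast_shape(), each input iterator is built from window.broadcast_if_dimension_le_one(), and border_size() reports the replication border for the narrower input. The sketch below illustrates only the broadcast-shape rule those calls rely on; it is a standalone approximation with a fixed 4-D Shape type, not the arm_compute implementation.

// Standalone sketch (not the arm_compute implementation) of the broadcast rule the
// kernel now relies on: two shapes are compatible when, dimension by dimension, the
// extents match or one of them is 1, and the result takes the larger extent.
#include <algorithm>
#include <array>
#include <cstddef>
#include <iostream>

using Shape = std::array<std::size_t, 4>;

// Returns an all-zero shape when the inputs are not broadcast compatible,
// mirroring the "out_shape.total_size() == 0" check in validate_arguments().
static Shape broadcast_shape(const Shape &a, const Shape &b)
{
    Shape out{};
    for(std::size_t d = 0; d < out.size(); ++d)
    {
        if(a[d] != b[d] && a[d] != 1 && b[d] != 1)
        {
            return Shape{ { 0, 0, 0, 0 } };
        }
        out[d] = std::max(a[d], b[d]);
    }
    return out;
}

int main()
{
    const Shape x{ { 16, 8, 3, 1 } }; // full tensor
    const Shape y{ { 1, 8, 3, 1 } };  // broadcast along dimension 0
    const Shape r = broadcast_shape(x, y);
    std::cout << r[0] << "x" << r[1] << "x" << r[2] << "x" << r[3] << std::endl; // prints 16x8x3x1
    return 0;
}
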
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index ac1fc39..683d48b 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -45,13 +45,11 @@
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
 
     if(act_info.enabled())
     {
         ActivationLayerInfo::ActivationFunction act = act_info.activation();
-        ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
                                     && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
                                     && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
@@ -102,16 +100,16 @@
 }
 } //namespace
 
-template <bool fused_activation>
+template <bool fused_activation, typename F>
 void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
 {
-    static_assert(!fused_activation, "Activation is not supported for FP16");
-
     ARM_COMPUTE_UNUSED(window);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window);
     Iterator output(_output, window);
 
+    F activation_functor(_act_info);
+
     // Hold information about the current feature map we are iterating.
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
@@ -151,22 +149,30 @@
         // Calculate x bar and store results
         const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
         const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
+
+        // Perform fused activation
+        if(fused_activation)
+        {
+            activation_functor(res);
+        }
+
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
     },
     input, output);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <bool fused_activation>
+template <bool fused_activation, typename F>
 void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
 {
-    static_assert(!fused_activation, "Activation is not supported for FP16");
-
     ARM_COMPUTE_UNUSED(window);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     Iterator input(_input, window);
     Iterator output(_output, window);
 
+    F activation_functor(_act_info);
+
     const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
     const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
@@ -186,7 +192,15 @@
         // Calculate x bar and store results
         const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
         const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
+        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
+
+        // Perform fused activation
+        if(fused_activation)
+        {
+            activation_functor(res);
+        }
+
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
     },
     input, output);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -299,9 +313,12 @@
     const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
     switch(_input->info()->data_type())
     {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false, ::detail::dummy<float16_t, 8>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false, ::detail::dummy<float16_t, 8>>;
             break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
             _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
                     &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
@@ -328,9 +345,30 @@
         { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
         { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
     };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    // NCHW Fused Batched Normalization with activation functions : FP16
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
+    {
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::lubrelu<float16_t, 8>> }
+    };
+    // NHWC Fused Batched Normalization with activation functions : FP16
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
+    {
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::lubrelu<float16_t, 8>> }
+    };
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
     switch(_input->info()->data_type())
     {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f16_nhwc[_act_info.activation()] : bn_fused_map_f16_nchw[_act_info.activation()];
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
             _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
             break;
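The batch normalization changes above bring the FP16 paths in line with FP32 by templating the kernel body on a compile-time fused_activation flag and an activation functor F, then selecting the instantiation from a map keyed on the requested activation. A minimal standalone sketch of that mechanism follows; ReluFunctor/DummyFunctor are illustrative stand-ins for the ::detail functors, and the scalar loop stands in for the NEON vector code.

// Minimal standalone sketch of templating the kernel body on a compile-time flag plus
// an activation functor, so the fused path costs nothing when disabled.
#include <algorithm>
#include <vector>

struct DummyFunctor
{
    float operator()(float x) const { return x; }
};

struct ReluFunctor
{
    float operator()(float x) const { return std::max(0.f, x); }
};

template <bool fused_activation, typename F>
void batch_normalize(std::vector<float> &data, float mean, float inv_stddev, float beta, float gamma)
{
    F activation{};
    for(float &v : data)
    {
        float res = beta + (v - mean) * inv_stddev * gamma;

        // Resolved at compile time, matching the if(fused_activation) branch in the diff.
        if(fused_activation)
        {
            res = activation(res);
        }
        v = res;
    }
}

// Usage: pick the instantiation once, e.g. from a map keyed on the requested
// activation, as the kernel's configure step does.
// batch_normalize<true, ReluFunctor>(values, mean, inv_stddev, beta, gamma);
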
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index dc37452..fa51a7b 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -51,744 +51,6 @@
 constexpr int MAYBE   = 127;
 } // namespace
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
-    // Constant use for evaluating score1 and score3
-    static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
-    static const float32x4_t zero    = vdupq_n_f32(0.0f);
-    static const float32x4_t one     = vdupq_n_f32(1.0f);
-    static const float32x4_t two     = vdupq_n_f32(2.0f);
-    static const float32x4_t three   = vdupq_n_f32(3.0f);
-
-    // Score0: (1, 0)
-    const float32x4x2_t score0 =
-    {
-        vabsq_f32(gx.val[0]),
-        vabsq_f32(gx.val[1])
-    };
-
-    // Score2: ( 0, 1 )
-    const float32x4x2_t score2 =
-    {
-        vabsq_f32(gy.val[0]),
-        vabsq_f32(gy.val[1])
-    };
-
-    // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
-    float32x4x2_t score1 =
-    {
-        vmulq_f32(gy.val[0], const45),
-        vmulq_f32(gy.val[1], const45)
-    };
-
-    float32x4x2_t score3 = score1;
-
-    score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
-    score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
-    score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
-    score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
-
-    score1.val[0] = vabsq_f32(score1.val[0]);
-    score1.val[1] = vabsq_f32(score1.val[1]);
-    score3.val[0] = vabsq_f32(score3.val[0]);
-    score3.val[1] = vabsq_f32(score3.val[1]);
-
-    float32x4x2_t phase =
-    {
-        zero,
-        zero
-    };
-
-    float32x4x2_t old_score = score0;
-
-    // score1 > old_score?
-    uint32x4x2_t mask =
-    {
-        vcgtq_f32(score1.val[0], old_score.val[0]),
-        vcgtq_f32(score1.val[1], old_score.val[1])
-    };
-
-    phase.val[0]     = vbslq_f32(mask.val[0], one, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], one, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
-
-    // score2 > old_score?
-    mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
-    mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
-
-    phase.val[0]     = vbslq_f32(mask.val[0], two, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], two, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
-
-    // score3 > old_score?
-    mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
-    mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
-
-    phase.val[0]     = vbslq_f32(mask.val[0], three, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], three, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
-
-    // Convert from float32x4_t to uint8x8_t
-    return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
-                                  vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
-}
-
-inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy)
-{
-    // Constant use for evaluating score1 and score3
-    static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f);
-    static const float16x8_t zero    = vdupq_n_f16(0.0f);
-    static const float16x8_t one     = vdupq_n_f16(1.0f);
-    static const float16x8_t two     = vdupq_n_f16(2.0f);
-    static const float16x8_t three   = vdupq_n_f16(3.0f);
-
-    // Score0: (1, 0)
-    const float16x8_t score0 = vabsq_f16(gx);
-
-    // Score2: ( 0, 1 )
-    const float16x8_t score2 = vabsq_f16(gy);
-
-    // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
-    float16x8_t score1 = vmulq_f16(gy, const45);
-    float16x8_t score3 = score1;
-
-    score1 = vfmaq_f16(score1, gx, const45);
-    score3 = vfmsq_f16(score3, gx, const45);
-
-    score1 = vabsq_f16(score1);
-    score3 = vabsq_f16(score3);
-
-    float16x8_t phase     = zero;
-    float16x8_t old_score = score0;
-
-    // score1 > old_score?
-    uint16x8_t mask = vcgtq_f16(score1, old_score);
-
-    phase     = vbslq_f16(mask, one, phase);
-    old_score = vbslq_f16(mask, score1, old_score);
-
-    // score2 > old_score?
-    mask = vcgtq_f16(score2, old_score);
-
-    phase     = vbslq_f16(mask, two, phase);
-    old_score = vbslq_f16(mask, score2, old_score);
-
-    // score3 > old_score?
-    mask = vcgtq_f16(score3, old_score);
-
-    phase = vbslq_f16(mask, three, phase);
-
-    // Convert from float16x8_t to uint8x8_t
-    return vmovn_u16(vcvtq_u16_f16(phase));
-}
-
-/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
- *         0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy));
-}
-
-/** Computes the gradient phase if gradient_size = 7. The output is quantized.
- *         0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    // Convert to float
-    const float32x4x2_t gx_f32 =
-    {
-        vcvtq_f32_s32(gx.val[0]),
-        vcvtq_f32_s32(gx.val[1])
-    };
-
-    const float32x4x2_t gy_f32 =
-    {
-        vcvtq_f32_s32(gy.val[0]),
-        vcvtq_f32_s32(gy.val[1])
-    };
-
-    return phase_quantization(gx_f32, gy_f32);
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
-                     vreinterpretq_u16_s16(vabsq_s16(gy)));
-}
-
-/** Computes the magnitude using the L1-norm type if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    const uint32x4x2_t gx_abs =
-    {
-        vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
-        vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
-    };
-
-    const uint32x4x2_t gy_abs =
-    {
-        vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
-        vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
-    };
-
-    const uint32x4x2_t out =
-    {
-        vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
-        vaddq_u32(gx_abs.val[1], gy_abs.val[1])
-    };
-
-    return out;
-}
-
-inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
-    // x^2 ...
-    float32x4x2_t mag =
-    {
-        vmulq_f32(gx.val[0], gx.val[0]),
-        vmulq_f32(gx.val[1], gx.val[1])
-    };
-
-    // ... + y^2
-    mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]);
-    mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]);
-
-    // sqrt(...)
-    mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]);
-    mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]);
-
-    return mag;
-}
-
-inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy)
-{
-    // x^2 ...
-    float16x8_t mag = vmulq_f16(gx, gx);
-
-    // ... + y^2
-    mag = vfmaq_f16(mag, gy, gy);
-
-    // sqrt(...)
-    mag = vmulq_f16(vrsqrteq_f16(mag), mag);
-
-    return mag;
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    /* Compute magnitude using L2 normalization */
-    const float16x8_t gx2 = vcvtq_f16_s16(gx);
-    const float16x8_t gy2 = vcvtq_f16_s16(gy);
-    const float16x8_t mag = mag_l2(gx2, gy2);
-
-    /* Store magnitude - Convert to uint16x8 */
-    return vcvtq_u16_f16(mag);
-}
-
-/** Computes the magnitude using L2-norm if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    // Compute magnitude using L2 normalization
-    float32x4x2_t gx2 =
-    {
-        vcvtq_f32_s32(gx.val[0]),
-        vcvtq_f32_s32(gx.val[1])
-    };
-
-    float32x4x2_t gy2 =
-    {
-        vcvtq_f32_s32(gy.val[0]),
-        vcvtq_f32_s32(gy.val[1])
-    };
-
-    const float32x4x2_t mag = mag_l2(gx2, gy2);
-    const uint32x4x2_t  mag32 =
-    {
-        vcvtq_u32_f32(mag.val[0]),
-        vcvtq_u32_f32(mag.val[1])
-    };
-
-    return mag32;
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
- *
- * @param[in]  in1_ptr  Pointer to source image. Gx image. Data type supported S16
- * @param[in]  in2_ptr  Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
-    const auto in1  = static_cast<const int16_t *__restrict>(in1_ptr);
-    const auto in2  = static_cast<const int16_t *__restrict>(in2_ptr);
-    const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
-    const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
-    const int16x8x4_t gx =
-    {
-        vld1q_s16(in1),
-        vld1q_s16(in1 + 8),
-        vld1q_s16(in1 + 16),
-        vld1q_s16(in1 + 24)
-    };
-
-    const int16x8x4_t gy =
-    {
-        vld1q_s16(in2),
-        vld1q_s16(in2 + 8),
-        vld1q_s16(in2 + 16),
-        vld1q_s16(in2 + 24)
-    };
-
-    // Compute and store phase
-    vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
-    vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
-    vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
-    vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
-    // Compute ans store magnitude using L1 normalization
-    vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0]));
-    vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1]));
-    vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2]));
-    vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
- *
- * @param[in]  in1_ptr  Pointer to source image. Gx image. Data type supported S16
- * @param[in]  in2_ptr  Pointer to source image. Gy image. Data type supported S16
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
-    const auto in1  = static_cast<const int16_t *__restrict>(in1_ptr);
-    const auto in2  = static_cast<const int16_t *__restrict>(in2_ptr);
-    const auto out1 = static_cast<uint16_t *__restrict>(out1_ptr);
-    const auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
-    const int16x8x4_t gx =
-    {
-        vld1q_s16(in1),
-        vld1q_s16(in1 + 8),
-        vld1q_s16(in1 + 16),
-        vld1q_s16(in1 + 24)
-    };
-
-    const int16x8x4_t gy =
-    {
-        vld1q_s16(in2),
-        vld1q_s16(in2 + 8),
-        vld1q_s16(in2 + 16),
-        vld1q_s16(in2 + 24)
-    };
-
-    // Compute and store phase
-    vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0]));
-    vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1]));
-    vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2]));
-    vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3]));
-
-    // Compute and store magnitude using L2 normalization
-    vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0]));
-    vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1]));
-    vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2]));
-    vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3]));
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
- *
- * @param[in]  in1_ptr  Pointer to source image. Gx image. Data type supported S32
- * @param[in]  in2_ptr  Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
-    auto in1  = static_cast<const int32_t *__restrict>(in1_ptr);
-    auto in2  = static_cast<const int32_t *__restrict>(in2_ptr);
-    auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
-    auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
-    // Process low and high part
-    for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
-    {
-        const int32x4x2_t gx0 =
-        {
-            vld1q_s32(in1 + 0),
-            vld1q_s32(in1 + 4)
-        };
-
-        const int32x4x2_t gx1 =
-        {
-            vld1q_s32(in1 + 8),
-            vld1q_s32(in1 + 12)
-        };
-
-        const int32x4x2_t gy0 =
-        {
-            vld1q_s32(in2 + 0),
-            vld1q_s32(in2 + 4)
-        };
-
-        const int32x4x2_t gy1 =
-        {
-            vld1q_s32(in2 + 8),
-            vld1q_s32(in2 + 12)
-        };
-
-        // Compute and store phase
-        vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
-        vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
-        // Compute magnitude using L1 normalization
-        const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
-        const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
-
-        // Store magnitude
-        vst1q_u32(out1 + 0, mag0.val[0]);
-        vst1q_u32(out1 + 4, mag0.val[1]);
-        vst1q_u32(out1 + 8, mag1.val[0]);
-        vst1q_u32(out1 + 12, mag1.val[1]);
-    }
-}
-
-/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
- *
- * @param[in]  in1_ptr  Pointer to source image. Gx image. Data type supported S32
- * @param[in]  in2_ptr  Pointer to source image. Gy image. Data type supported S32
- * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr)
-{
-    auto in1  = static_cast<const int32_t *__restrict>(in1_ptr);
-    auto in2  = static_cast<const int32_t *__restrict>(in2_ptr);
-    auto out1 = static_cast<uint32_t *__restrict>(out1_ptr);
-    auto out2 = static_cast<uint8_t *__restrict>(out2_ptr);
-
-    // Process low and high part
-    for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16)
-    {
-        const int32x4x2_t gx0 =
-        {
-            vld1q_s32(in1 + 0),
-            vld1q_s32(in1 + 4)
-        };
-
-        const int32x4x2_t gx1 =
-        {
-            vld1q_s32(in1 + 8),
-            vld1q_s32(in1 + 12)
-        };
-
-        const int32x4x2_t gy0 =
-        {
-            vld1q_s32(in2 + 0),
-            vld1q_s32(in2 + 4)
-        };
-
-        const int32x4x2_t gy1 =
-        {
-            vld1q_s32(in2 + 8),
-            vld1q_s32(in2 + 12)
-        };
-
-        // Compute and store phase
-        vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0));
-        vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1));
-
-        // Compute magnitude using L2 normalization
-        const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
-        const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
-
-        // Store magnitude
-        vst1q_u32(out1 + 0, mag0.val[0]);
-        vst1q_u32(out1 + 4, mag0.val[1]);
-        vst1q_u32(out1 + 8, mag1.val[0]);
-        vst1q_u32(out1 + 12, mag1.val[1]);
-    }
-}
-
-inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
-{
-    // Phase for 4 pixel
-    const uint32x4_t pc32 = vmovl_u16(pc);
-
-    // Get magnitude for 4 pixel
-    uint32x4_t mc = vld1q_u32(in);
-
-    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
-    // 0 degree
-    const uint32x4_t mk0_0 = vld1q_u32(in - 1);
-    const uint32x4_t mk0_1 = vld1q_u32(in + 1);
-    uint32x4_t       mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
-    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
-    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
-
-    // 45 degree
-    const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1);
-    const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1);
-    uint32x4_t       mask1  = vceqq_u32(pc32, vdupq_n_u32(1));
-    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
-    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
-
-    // 90 degree
-    const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag);
-    const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag);
-    uint32x4_t       mask2  = vceqq_u32(pc32, vdupq_n_u32(2));
-    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
-    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
-
-    // 135 degree
-    const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1);
-    const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1);
-    uint32x4_t       mask3   = vceqq_u32(pc32, vdupq_n_u32(3));
-    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
-    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
-
-    // Merge masks
-    mask0 = vorrq_u32(mask0, mask1);
-    mask2 = vorrq_u32(mask2, mask3);
-    mask0 = vorrq_u32(mask0, mask2);
-
-    mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
-
-    // mc > upper_thr
-    mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
-
-    // mc <= lower_thr
-    mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
-
-    // mc <= upper_thr && mc > lower_thr
-    mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
-    mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
-
-    mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
-    mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
-    mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
-
-    return vmovn_u32(mc);
-}
-
-/** Computes edge tracing when is called by edge_trace_U8_U8 recursively
- *
- * @param[in]  in         Pointer to source image. Data type supported U8
- * @param[out] out        Pointer to destination image. Data type supported U8
- * @param[in]  in_stride  Stride of the input image
- * @param[in]  out_stride Stride of the output image
- */
-void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride)
-{
-    // Look for MAYBE pixels in 8 directions
-    *out = EDGE;
-
-    // (-1, 0)
-    uint8_t pixel = *(in - 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in - 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
-    }
-
-    // (+1, 0)
-    pixel = *(in + 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in + 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
-    }
-
-    in -= in_stride;
-    out -= out_stride;
-
-    // (-1, -1)
-    pixel = *(in - 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in - 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
-    }
-
-    // (0, -1)
-    pixel = *in;
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *in = EDGE;
-
-        edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
-    }
-
-    // (+1, -1)
-    pixel = *(in + 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in + 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
-    }
-
-    in += in_stride * 2;
-    out += out_stride * 2;
-
-    // (-1, +1)
-    pixel = *(in - 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in - 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride);
-    }
-
-    // (0, +1)
-    pixel = *in;
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *in = EDGE;
-
-        edge_trace_recursive_U8_U8(in, out, in_stride, out_stride);
-    }
-
-    // (+1, +1)
-    pixel = *(in + 1);
-
-    if(pixel == MAYBE)
-    {
-        // Touched a MAYBE point. MAYBE becomes EDGE
-        *(in + 1) = EDGE;
-
-        edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride);
-    }
-}
-} // namespace fp16
-
-void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
-
-    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
-    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
-
-    Format magnitude_format = gx->info()->data_type() == DataType::S16 ? Format::U16 : Format::U32;
-    set_format_if_unknown(*magnitude->info(), magnitude_format);
-    set_format_if_unknown(*phase->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
-    ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    if(_gx->info()->data_type() == DataType::S16)
-    {
-        if(norm_type == 1)
-        {
-            _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8;
-        }
-        else
-        {
-            _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8;
-        }
-    }
-    else
-    {
-        if(norm_type == 1)
-        {
-            _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8;
-        }
-        else
-        {
-            _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8;
-        }
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 32;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
-
-    mag_access.set_valid_region(win, _gx->info()->valid_region());
-    phase_access.set_valid_region(win, _gx->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
 namespace
 {
 inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
new file mode 100644
index 0000000..f8217d3
--- /dev/null
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input,
+                                                         1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
+
+    const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+    ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    }
+
+    return Status{};
+}
+void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
+{
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const size_t       element_size = input->info()->element_size();
+    const unsigned int K            = input->info()->dimension(channel_idx) / num_groups;
+    const float        rK           = 1.f / K;
+
+    Iterator in(input, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Shuffle channel
+        const unsigned int curr_channel = id.x();
+        const unsigned int group_id     = curr_channel * rK;
+        const unsigned int r            = group_id * K;
+        const unsigned int channel_id   = curr_channel - r;
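+        // The channels are viewed as a (num_groups x K) grid and transposed: input
+        // channel c in group g = c / K with within-group index k = c % K is written
+        // to output channel k * num_groups + g. For example, with 6 channels and
+        // num_groups = 2 (K = 3), input channels [0,1,2,3,4,5] map to output
+        // channels [0,2,4,1,3,5].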
+
+        // Calculate output coordinates
+        Coordinates out_coords = id;
+        out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+        std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+    },
+    in);
+}
+void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
+{
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const unsigned int height          = input->info()->tensor_shape().y();
+    const size_t       input_stride_y  = input->info()->strides_in_bytes().y();
+    const size_t       output_stride_y = output->info()->strides_in_bytes().y();
+    const size_t       row_size        = input->info()->dimension(width_idx) * input->info()->element_size();
+
+    const unsigned int K  = input->info()->dimension(channel_idx) / num_groups;
+    const float        rK = 1.f / K;
+
+    Iterator in(input, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        // Shuffle channel
+        const unsigned int curr_channel = id.z();
+        const unsigned int group_id     = curr_channel * rK;
+        const unsigned int r            = group_id * K;
+        const unsigned int channel_id   = curr_channel - r;
+
+        // Calculate output coordinates
+        Coordinates out_coords = id;
+        out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+        const uint8_t *input_ptr  = in.ptr();
+        uint8_t       *output_ptr = output->ptr_to_element(out_coords);
+
+        // Copy plane
+        for(unsigned int y = 0; y < height; ++y)
+        {
+            std::copy_n(input_ptr, row_size, output_ptr);
+            input_ptr += input_stride_y;
+            output_ptr += output_stride_y;
+        }
+    },
+    in);
+}
+} // namespace
+
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
+    : _input(nullptr), _output(nullptr), _num_groups()
+{
+}
+
+void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+
+    _input      = input;
+    _output     = output;
+    _num_groups = num_groups;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+
+    // The NEChannelShuffleLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+    INEKernel::configure(win);
+}
+
+Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
+    return Status{};
+}
+
+void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    switch(_input->info()->data_layout())
+    {
+        case DataLayout::NHWC:
+            channel_shuffle_nhwc(_input, _output, _num_groups, window);
+            break;
+        case DataLayout::NCHW:
+            channel_shuffle_nchw(_input, _output, _num_groups, window);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data layout!");
+            break;
+    }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index bb8e758..d6517ac 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -29,26 +29,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
 
 using namespace arm_compute;
+using namespace misc::shape_calculator;
 
 namespace
 {
-TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_dims)
-{
-    TensorShape output_shape = input->tensor_shape();
-    output_shape.set(0, convolved_dims.width);
-    output_shape.set(1, convolved_dims.height);
-    output_shape.set(2, input->tensor_shape()[0]);
-    output_shape.set(3, input->tensor_shape()[3]); // For NEON the batch size is on the fourth dimension of the input tensor
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -60,12 +51,28 @@
     // Validate configured output
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
     return Status{};
 }
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims)
+{
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, false)));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
+    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
 } // namespace
 
 template <typename T>
@@ -102,11 +109,6 @@
 void NECol2ImKernel::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), convolved_dims)));
-
-    // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
 
     _input          = input;
@@ -130,19 +132,15 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
-    Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), convolved_dims);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 Status NECol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first);
     return Status{};
 }
 
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 4582c88..7a66b6c 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -112,6 +112,10 @@
                     _func                             = colorconvert_rgb_to_rgbx;
                     num_elems_processed_per_iteration = 16;
                     break;
+                case Format::U8:
+                    _func                             = colorconvert_rgb_to_u8;
+                    num_elems_processed_per_iteration = 16;
+                    break;
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
                     break;
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index d2eac2c..31b688c 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -102,7 +102,7 @@
     }
     else
     {
-        const float diff = image_size - 1;
+        const float diff = image_size - cd_min;
 
         for(unsigned int x = 0; x < _histogram_size; ++x)
         {
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 8280b52..158f401 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -34,68 +35,90 @@
 
 using namespace arm_compute;
 
-namespace arm_compute
+namespace
 {
-class Coordinates;
-} // namespace arm_compute
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
+                                                                           && output->data_type() != DataType::S32),
+                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
+                                    "Only data_types supported [in] U16 ->  [out] U8, U32");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
+                                    "Only data_types supported [in] S16 ->  [out] U8, S32");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
+                                    "Only data_types supported [in] F16 ->  [out] F32");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
+                                    "Only data_types supported [in] F32 ->  [out] F16");
+
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, output->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
     : _input(nullptr), _output(nullptr), _policy(), _shift(0)
 {
 }
 
-void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto-initialize the output shape if not yet initialized (only the shape can be auto-configured; the data type must be given)
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
 
     _input  = input;
-    _output = input;
+    _output = output;
     _policy = policy;
     _shift  = shift;
 
-    if(output != nullptr)
-    {
-        // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
-        set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-        // Set output
-        _output = output;
-    }
-
-    ARM_COMPUTE_ERROR_ON(shift >= 8);
-    ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
-                                                                            && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] U8 -> [out] U16, S16, S32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
-                             "Only data_types supported [in] U16 ->  [out] U8, U32");
-
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] S16 ->  [out] U8, S32");
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
+}
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        update_window_and_padding(win, input_access);
-    }
-    ICPPKernel::configure(win);
+Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
 }
 
 void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -103,8 +126,7 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(nullptr == _input);
-    ARM_COMPUTE_ERROR_ON(nullptr == _output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
     ARM_COMPUTE_ERROR_ON(_input == _output);
 
     Iterator input(_input, window);
@@ -341,6 +363,68 @@
             }
             break;
         }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            switch(_output->info()->data_type())
+            {
+                case DataType::F32:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1 << _shift);
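+                    // The fixed-point shift is applied as a multiplication by 2^shift on the up-converted values.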
+
+                    /* Up-conversion F16 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float16x8x2_t texels =
+                        {
+                            {
+                                vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())),
+                                vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        case DataType::F32:
+            switch(_output->info()->data_type())
+            {
+                case DataType::F16:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+                    /* Down-conversion F32 -> F16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale)
+                            }
+                        };
+
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 09e4acd..99bdb7a 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -147,7 +146,7 @@
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     const DataLayout   data_layout = input->data_layout();
@@ -166,8 +165,14 @@
         const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
 
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (output->data_type() != DataType::S32));
-        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input->data_type()) && (output->data_type() != DataType::F32));
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        }
     }
 
     return Status{};
@@ -193,8 +198,10 @@
             output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
             output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
 
+            const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
             // Output auto inizialitation if not yet initialized
-            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
 
             // Configure window (optimised)
             // Set padding in channels
@@ -230,6 +237,11 @@
             case DataType::QASYMM8:
                 num_elems_read_per_iteration = 16;
                 break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+                num_elems_read_per_iteration = 24;
+                break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F32:
                 num_elems_read_per_iteration = 12;
                 break;
@@ -314,7 +326,7 @@
     }
 
     // Check supported data type
-    bool supported_datatype = (dt == DataType::F32);
+    bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
 
     // Check for supported strides
     const auto &strides           = conv_info.stride();
@@ -335,11 +347,15 @@
 
 void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
     ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
 
     _convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
+    if(_convolver)
+    {
+        _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
+    }
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -372,6 +388,11 @@
 
     switch(_input->info()->data_type())
     {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
             convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier);
             break;
@@ -399,6 +420,7 @@
                                                                                                                 ITensor       *out,
                                                                                                                 bool           setup_strides)
 {
+    const DataType    dt                  = in->info()->data_type();
     const TensorShape shape               = in->info()->tensor_shape();
     const int         in_rows             = shape.z();
     const int         in_cols             = shape.y();
@@ -415,34 +437,85 @@
     const int         output_batch_stride = (setup_strides) ? out->info()->strides_in_bytes()[3] / out->info()->element_size() : 0;
 
     const auto stride_x = conv_info.stride().first;
-    switch(stride_x)
+    switch(dt)
     {
-        case 1:
-            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
-                       n_batches,
-                       in_rows,
-                       in_cols,
-                       n_channels,
-                       padding_same,
-                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
-                       weight_col_stride, weight_row_stride,
-                       input_col_stride, input_row_stride, input_batch_stride,
-                       output_col_stride, output_row_stride, output_batch_stride);
-        case 2:
-            return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
-                       n_batches,
-                       in_rows,
-                       in_cols,
-                       n_channels,
-                       padding_same,
-                       reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
-                       reinterpret_cast<float *>(out->ptr_to_element(Coordinates())),
-                       weight_col_stride, weight_row_stride,
-                       input_col_stride, input_row_stride, input_batch_stride,
-                       output_col_stride, output_row_stride, output_batch_stride);
+        case DataType::QASYMM8:
+        {
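+            // QASYMM8 uses uint8_t input/weight data with int32_t accumulators; requantization
+            // back to QASYMM8 is handled by a separate output stage.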
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+                               in->ptr_to_element(Coordinates()),
+                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+                               in->ptr_to_element(Coordinates()),
+                               reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float16_t *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float16_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+        {
+            switch(stride_x)
+            {
+                case 1:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                case 2:
+                    return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float>>(
+                               n_batches, in_rows, in_cols, n_channels, padding_same,
+                               reinterpret_cast<const float *>(w->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(in->ptr_to_element(Coordinates())),
+                               reinterpret_cast<float *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+                               weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+                               output_col_stride, output_row_stride, output_batch_stride);
+                default:
+                    return nullptr;
+            }
+            break;
+        }
         default:
             return nullptr;
     }
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 92ee8d5..e8fb8cd 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 2d17c23..921582a 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 22a2cf8..77ab5ad 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index 06e6b03..cfed324 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -81,9 +81,11 @@
     AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
 
+    // TODO(COMPMID-1503) Fix x-access input bug in NEON kernel instead of '+2'
     AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
     AccessWindowRectangle  in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration);
 
+    // TODO(COMPMID-1503) Fix x-access input bug in NEON kernel instead of '+2'
     AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_rows_read_per_iteration);
 
     if(run_der_x && run_der_y)
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index f525d93..162c4b1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -36,6 +36,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <algorithm>
 #include <arm_neon.h>
 
@@ -603,10 +604,9 @@
                                 out_values = internal_vmlal(out_values, in_values, we_values);
                             }
 
-                            out_val += out_values[0];
-                            out_val += out_values[1];
-                            out_val += out_values[2];
-                            out_val += out_values[3];
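+                            // Horizontally reduce the four partial sums: pairwise-add the high and
+                            // low halves, pairwise-add the result with itself, then extract lane 0.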
+                            auto carry_addition = wrapper::vpadd(wrapper::vgethigh(out_values), wrapper::vgetlow(out_values));
+                            carry_addition      = wrapper::vpadd(carry_addition, carry_addition);
+                            out_val += wrapper::vgetlane(carry_addition, 0);
 
                             // Leftover
                             for(; x < input_width; ++x)
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index eefbd98..a571d54 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -194,8 +194,8 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -304,14 +304,14 @@
                 internal_vst1q(out_ptr, internal_vld1q(in_ptr));
             }
         },
-        in, bi);
+        in, bi, out);
     }
 }
 
 // QASYMM8 specializations
 template <>
-void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                                                 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                      int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
     uint8x16_t      min                           = vdupq_n_u8(0);
@@ -352,8 +352,8 @@
     in, out);
 }
 template <>
-void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
-                                                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_UNUSED(bias);
 
@@ -382,6 +382,85 @@
     },
     in, out);
 }
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                      int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Window window_bias = window;
+    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    window_bias.set(3, Window::Dimension(0, 0, 0));
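+    // With zero-sized Y, Z and batch dimensions the bias iterator only advances along
+    // the channel (X) dimension, so the bias vector is broadcast across all spatial
+    // positions and batches.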
+
+    Iterator in(input, window);
+    Iterator bi(bias, window_bias);
+
+    Iterator out(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr   = reinterpret_cast<int32_t *>(in.ptr());
+        const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+
+        // Accumulate bias
+        int32x4x4_t v_in =
+        {
+            {
+                vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
+                vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
+                vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
+                vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
+            }
+        };
+
+        const auto out_ptr = out.ptr();
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, bi, out);
+}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    ARM_COMPUTE_UNUSED(bias);
+
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Window window_bias = window;
+    window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+    window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    window_bias.set(3, Window::Dimension(0, 0, 0));
+
+    Iterator in(input, window);
+    Iterator bi(bias, window_bias);
+
+    Iterator out(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+
+        // Accumulate bias
+        int32x4x4_t v_in =
+        {
+            {
+                vld1q_s32(in_ptr),
+                vld1q_s32(in_ptr + 4),
+                vld1q_s32(in_ptr + 8),
+                vld1q_s32(in_ptr + 12)
+            }
+        };
+
+        const auto out_ptr = out.ptr();
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, bi, out);
+}
 } // namespace
 
 NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
@@ -426,19 +505,19 @@
         {
             case DataType::S32:
             {
-                _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+                _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
                 break;
             }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
-                _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+                _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
-                _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+                _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
                 break;
             }
             default:
@@ -451,6 +530,18 @@
     {
         switch(input->info()->data_type())
         {
+            case DataType::S32:
+            {
+                _func = (bias == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+                break;
+            }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+            case DataType::F16:
+            {
+                _func = (output == nullptr) ? &output_stage_nhwc<float16_t, float16_t, true, true> : &output_stage_nhwc<float16_t, float16_t, false, true>;
+                break;
+            }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
                 _func = (output == nullptr) ? &output_stage_nhwc<float, float, true, true> : &output_stage_nhwc<float, float, false, true>;
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
new file mode 100644
index 0000000..b8452fb
--- /dev/null
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+using namespace misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+    // Checks performed when output is configured
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_flatten_shape(input));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_flatten_shape(input)));
+
+    Window win = calculate_max_window(*input, Steps()); // Flatten does not need padding
+
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+NEFlattenLayerKernel::NEFlattenLayerKernel()
+    : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEFlattenLayerKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEFlattenLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    return Status{};
+}
+
+void NEFlattenLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const size_t in_width   = _input->info()->dimension(0);
+    const size_t in_height  = _input->info()->dimension(1);
+    const size_t out_step_x = in_width * _input->info()->element_size();
+    const size_t out_step_y = out_step_x * in_height;
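+    // Each row of the input occupies out_step_x bytes in the flattened output and each
+    // (width x height) plane occupies out_step_y bytes, so a whole image collapses into
+    // one contiguous row of the output tensor.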
+
+    Window in_window(window);
+    in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Window out_window;
+    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+    out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
+
+    Window in_slice  = in_window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_1D();
+
+    do
+    {
+        Iterator in(_input, in_slice);
+        Iterator out(_output, out_slice);
+
+        uint8_t *out_ptr = out.ptr();
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
+        },
+        in);
+    }
+    while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+}
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 872ac26..6551d9e 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
@@ -33,7 +34,42 @@
 
 #include <arm_neon.h>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    auto_init_if_empty(*output, *input);
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
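+    // Process one 128-bit NEON register per iteration: 4 F32 elements or 8 F16 elements.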
+
+    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, input->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
 
 void NEFloorKernel::configure(const ITensor *input, ITensor *output)
 {
@@ -42,24 +78,24 @@
     // Auto initialize output
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
 
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 4;
-
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+Status NEFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 void NEFloorKernel::run(const Window &window, const ThreadInfo &info)
@@ -68,13 +104,34 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    const DataType data_type = _input->info()->data_type();
+
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    if(data_type == DataType::F32)
     {
-        const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
-    },
-    input, output);
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const float32x4_t res = vfloorq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())));
+            vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+        },
+        input, output);
+    }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    else if(data_type == DataType::F16)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const float16x8_t res = vfloorq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())));
+            vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+        },
+        input, output);
+    }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    else
+    {
+        ARM_COMPUTE_ERROR("Invalid data type!");
+    }
 }
+} // namespace arm_compute
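
A hedged usage sketch of the static validate() entry point added above (the tensor infos and the ErrorCode::OK check follow the usual arm_compute conventions and are assumptions, not part of the patch):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"

    // Returns true when NEFloorKernel supports the given input/output combination
    // (F16/F32 input, matching data types and shapes), without configuring anything.
    bool floor_is_supported(const arm_compute::TensorInfo &src, const arm_compute::TensorInfo &dst)
    {
        const arm_compute::Status status = arm_compute::NEFloorKernel::validate(&src, &dst);
        return status.error_code() == arm_compute::ErrorCode::OK;
    }
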
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index af84d02..33a5b4a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -62,16 +62,24 @@
     if(b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
         if(output_shape.num_dimensions() > 1)
         {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
             TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
             vector_sum_row_shape.collapse_from(1);
-            output_shape.collapse_from(2);
+            output_shape.collapse_from(output_batch_idx);
 
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
             if(a_offset != 0)
@@ -117,6 +125,217 @@
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
+
+template <bool is_gemm3d>
+void run_offset_contribution(const Window &window,
+                             ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
+                             int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col)
+{
+    Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
+
+    const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
+    const int depth_input  = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+    if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+    {
+        // Set window for vector_sum_col
+        Window win_vector_sum_col(collapsed_window);
+        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        // Set window for vector_sum_row
+        Window win_vector_sum_row(collapsed_window);
+        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+        Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+        Iterator mm_result_it(mm_result, window);
+
+        const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+        // Offset in case vector_sum_col is batched
+        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+        execute_window_loop(collapsed_window, [&](const Coordinates & id)
+        {
+            const int  batch_id           = id.z() / depth_input;
+            const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+
+            // Compute the leftover term due to a_offset.
+            int32x4x4_t a_offset_term_s32 =
+            {
+                {
+                    vld1q_s32(vector_sum_col_ptr + 0),
+                    vld1q_s32(vector_sum_col_ptr + 4),
+                    vld1q_s32(vector_sum_col_ptr + 8),
+                    vld1q_s32(vector_sum_col_ptr + 12)
+                }
+            };
+
+            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+            // Compute the leftover term due to b_offset.
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
+                                                        + (id.z() % depth_input) * height_input);
+            b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+
+            // Add a_offset_term_s32 and b_offset_term_s32
+            int32x4x4_t offset_term_s32 =
+            {
+                {
+                    vdupq_n_s32(k_offset),
+                    vdupq_n_s32(k_offset),
+                    vdupq_n_s32(k_offset),
+                    vdupq_n_s32(k_offset)
+                }
+            };
+
+            offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
+            offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
+            offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
+            offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_col_it, vector_sum_row_it, mm_result_it);
+    }
+    else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
+
+        // Set window for vector_sum_row
+        Window win_vector_sum_row(collapsed_window);
+        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row);
+        Iterator mm_result_it(mm_result, window);
+
+        const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int batch_id = id.z() / depth_input;
+
+            // Compute the leftover term due to b_offset.
+            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y()
+                                                        + (id.z() % depth_input) * height_input);
+            b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset);
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_row_it, mm_result_it);
+    }
+    else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+    {
+        // Set window for vector_sum_col
+        Window win_vector_sum_col(collapsed_window);
+        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+        win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+        Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
+        Iterator mm_result_it(mm_result, window);
+
+        // Offset in case vector_sum_col is batched
+        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int  batch_id           = id.z() / depth_input;
+            const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+
+            // Compute the leftover term due to a_offset.
+            int32x4x4_t a_offset_term_s32 =
+            {
+                {
+                    vld1q_s32(vector_sum_col_ptr + 0),
+                    vld1q_s32(vector_sum_col_ptr + 4),
+                    vld1q_s32(vector_sum_col_ptr + 8),
+                    vld1q_s32(vector_sum_col_ptr + 12)
+                }
+            };
+
+            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+            // Store the result with the offset contribution
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 0, in_s32.val[0]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 4, in_s32.val[1]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 8, in_s32.val[2]);
+            vst1q_s32(reinterpret_cast<int32_t *>(mm_result_it.ptr()) + 12, in_s32.val[3]);
+        },
+        vector_sum_col_it, mm_result_it);
+    }
+    else // false, false
+    {
+        // No offset contribution from matrix A and matrix B
+        return;
+    }
+}
 } // namespace
 
 NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
@@ -177,193 +396,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimZ);
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = _vector_sum_row != nullptr
+                                   && _mm_result->info()->num_dimensions() > 1
+                                   && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
 
-    if(_a_offset != 0 && _b_offset != 0) // true, true
+    if(reinterpret_as_3d)
     {
-        // Set window for vector_sum_col
-        Window win_vector_sum_col(collapsed_window);
-        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-        if(!_slide_vector_sum_col)
-        {
-            win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-        }
-
-        // Set window for vector_sum_row
-        Window win_vector_sum_row(collapsed_window);
-        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-        Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
-        Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
-        Iterator mm_result(_mm_result, window);
-
-        const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
-
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            // Compute the leftover term due to a_offset.
-            int32x4x4_t a_offset_term_s32 =
-            {
-                {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
-                }
-            };
-
-            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
-            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
-            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
-            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
-
-            // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
-            b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
-
-            // Add a_offset_term_s32 and b_offset_term_s32
-            int32x4x4_t offset_term_s32 =
-            {
-                {
-                    vdupq_n_s32(_k_offset),
-                    vdupq_n_s32(_k_offset),
-                    vdupq_n_s32(_k_offset),
-                    vdupq_n_s32(_k_offset)
-                }
-            };
-
-            offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32));
-            offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32));
-            offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32));
-            offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32));
-
-            int32x4x4_t in_s32 =
-            {
-                {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
-        },
-        vector_sum_col, vector_sum_row, mm_result);
+        run_offset_contribution<true>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
     }
-    else if((_a_offset == 0) && (_b_offset != 0)) // false, true
+    else
     {
-        // Set window for vector_sum_row
-        Window win_vector_sum_row(collapsed_window);
-        win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
-        win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
-        win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-        Iterator vector_sum_row(_vector_sum_row, win_vector_sum_row);
-        Iterator mm_result(_mm_result, window);
-
-        const size_t sum_row_stride_y = _vector_sum_row->info()->strides_in_bytes().y();
-
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            // Compute the leftover term due to b_offset.
-            int32x4_t b_offset_term_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(vector_sum_row.ptr() + id.z() * sum_row_stride_y) + id.y());
-            b_offset_term_s32           = vmulq_n_s32(b_offset_term_s32, _b_offset);
-
-            int32x4x4_t in_s32 =
-            {
-                {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
-        },
-        vector_sum_row, mm_result);
+        run_offset_contribution<false>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
     }
-    else if((_a_offset != 0) && (_b_offset == 0)) // true, false
-    {
-        // Set window for vector_sum_col
-        Window win_vector_sum_col(collapsed_window);
-        win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
-        if(!_slide_vector_sum_col)
-        {
-            win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
-        }
-
-        Iterator vector_sum_col(_vector_sum_col, win_vector_sum_col);
-        Iterator mm_result(_mm_result, window);
-
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            // Compute the leftover term due to a_offset.
-            int32x4x4_t a_offset_term_s32 =
-            {
-                {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(vector_sum_col.ptr()) + 12)
-                }
-            };
-
-            a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], _a_offset);
-            a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], _a_offset);
-            a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], _a_offset);
-            a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], _a_offset);
-
-            int32x4x4_t in_s32 =
-            {
-                {
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 0),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 4),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 8),
-                    vld1q_s32(reinterpret_cast<const int32_t *>(mm_result.ptr()) + 12)
-                }
-            };
-
-            // Add the offset terms to GEMM's result
-            in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
-            in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
-            in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
-            in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
-            // Store the result with the offset contribution
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 0, in_s32.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 4, in_s32.val[1]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 8, in_s32.val[2]);
-            vst1q_s32(reinterpret_cast<int32_t *>(mm_result.ptr()) + 12, in_s32.val[3]);
-        },
-        vector_sum_col, mm_result);
-    }
-    else // false, false
-    {
-        // No offset contribution from matrix A and matrix B
-        return;
-    }
-}
+}
\ No newline at end of file
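
A scalar sketch of the per-element arithmetic that run_offset_contribution vectorizes (illustrative only; k_offset is the precomputed a_offset * b_offset * K term set up by the kernel). For the 3D-reinterpreted case, sum_row is indexed with y + (z % depth) * height and the batch with z / depth, as in the code above.

    #include <cstdint>

    // mm_result(x, y) += a_offset * sum(column x of B) + b_offset * sum(row y of A) + k_offset
    inline int32_t offset_contribution(int32_t mm_value,
                                       int32_t a_offset, int32_t sum_col_x,
                                       int32_t b_offset, int32_t sum_row_y,
                                       int32_t k_offset)
    {
        return mm_value + a_offset * sum_col_x + b_offset * sum_row_y + k_offset;
    }
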
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 5e14e1a..024c4f8 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -28,10 +28,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <arm_neon.h>
 #include <cstddef>
@@ -58,7 +60,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, input);
     }
 
     return Status{};
@@ -71,8 +73,11 @@
     // For this reason num_elems_processed_per_iteration is set to 1
     constexpr unsigned int num_elems_processed_per_iteration = 1;
 
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
+
     // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
 
@@ -81,10 +86,7 @@
 
     if(output->total_size() != 0)
     {
-        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, output_result_access);
-
-        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
     }
 
     if(bias != nullptr)
@@ -148,12 +150,11 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator in(_input, win);
-    Iterator out(_output, win);
-
+    Iterator in(_input, win_collapsed);
+    Iterator out(_output, win_collapsed);
     if(_bias != nullptr)
     {
         Window win_biases;
@@ -161,7 +162,7 @@
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias(_bias, win_biases);
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates & id)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
@@ -210,11 +211,11 @@
                                                                           static_cast<uint8_t>(_max));
             }
         },
-        in, bias, out);
+        in, out, bias);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates & id)
         {
             // Compute 16 elements per iteration
             int x = window_start_x;
@@ -256,15 +257,7 @@
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  min,
-                                                  max));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
 
     _input                        = input;
     _bias                         = bias;
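
For context, a simplified scalar sketch of the gemmlowp-style fixed-point requantization this kernel's NEON path implements (rounding and saturation corner cases are omitted; names and the exact rounding behaviour are assumptions, not taken from the patch):

    #include <algorithm>
    #include <cstdint>

    // Simplified rounding doubling high multiply (gemmlowp-style; INT32_MIN saturation case omitted).
    inline int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
        return static_cast<int32_t>((ab + nudge) >> 31);
    }

    inline uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias,
                                            int32_t multiplier, int shift,
                                            int32_t offset_after_shift, int32_t qmin, int32_t qmax)
    {
        int32_t v = acc + bias;
        v = rounding_doubling_high_mul(v, multiplier);
        // Rounding right shift by 'shift' (simplified)
        const int32_t round = (shift > 0) ? (1 << (shift - 1)) : 0;
        v = (v + round) >> shift;
        v += offset_after_shift;
        v = std::max(qmin, std::min(qmax, v));
        return static_cast<uint8_t>(v);
    }
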
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index cd6aa55..757dbbc 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -32,15 +32,27 @@
 
 #include <arm_neon.h>
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(beta);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    }
+
+    return Status{};
+}
+
 void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta)
 {
     const float32x4_t beta_f32 = vdupq_n_f32(beta);
@@ -101,12 +113,10 @@
 
 void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), beta));
 
     switch(input->info()->data_type())
     {
@@ -123,13 +133,21 @@
             break;
     }
 
+    // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
 
     _beta = beta;
 }
 
+Status NEGEMMMatrixAdditionKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, beta));
+    ARM_COMPUTE_RETURN_ON_ERROR(INESimpleKernel::validate(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration));
+    return Status{};
+}
+
 void NEGEMMMatrixAdditionKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -141,3 +159,4 @@
         (*_func)(_input, _output, window, _beta);
     }
 }
+} // namespace arm_compute
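
A scalar sketch of the per-element operation the F32/F16 paths of this kernel vectorize (names illustrative): beta * C is accumulated in place into the matrix-multiplication result.

    #include <cstddef>

    inline void matrix_addition_ref(const float *c, float *dst, size_t n, float beta)
    {
        for(size_t i = 0; i < n; ++i)
        {
            dst[i] += beta * c[i]; // dst typically already holds alpha * A * B
        }
    }
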
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 0ca2474..f182fb2 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 5e1c216..61221c1 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -39,330 +39,6 @@
 
 using namespace arm_compute;
 
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace fp16
-{
-inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh)
-{
-    static const float16x8_t zero = vdupq_n_f16(0.f);
-
-    // Trace^2
-    float16x8_t trace2 = vaddq_f16(gx2, gy2);
-    trace2             = vmulq_f16(trace2, trace2);
-
-    // Det(A)
-    float16x8_t det = vmulq_f16(gx2, gy2);
-    det             = vfmsq_f16(det, gxgy, gxgy);
-
-    // Det(A) - sensitivity * trace^2
-    const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2);
-
-    // mc > strength_thresh
-    const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh));
-
-    return vbslq_f16(mask, mc, zero);
-}
-
-template <size_t block_size>
-inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy,
-                                              float norm_factor)
-{
-    const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor);
-
-    // Normalize
-    low_gx  = vmulq_f16(low_gx, norm_factor_fp16);
-    low_gy  = vmulq_f16(low_gy, norm_factor_fp16);
-    high_gx = vmulq_f16(high_gx, norm_factor_fp16);
-    high_gy = vmulq_f16(high_gy, norm_factor_fp16);
-
-    float16x8_t gx = vextq_f16(low_gx, high_gx, 0);
-    float16x8_t gy = vextq_f16(low_gy, high_gy, 0);
-
-    gx2  = vfmaq_f16(gx2, gx, gx);
-    gy2  = vfmaq_f16(gy2, gy, gy);
-    gxgy = vfmaq_f16(gxgy, gx, gy);
-
-    gx = vextq_f16(low_gx, high_gx, 1);
-    gy = vextq_f16(low_gy, high_gy, 1);
-
-    gx2  = vfmaq_f16(gx2, gx, gx);
-    gy2  = vfmaq_f16(gy2, gy, gy);
-    gxgy = vfmaq_f16(gxgy, gx, gy);
-
-    gx = vextq_f16(low_gx, high_gx, 2);
-    gy = vextq_f16(low_gy, high_gy, 2);
-
-    gx2  = vfmaq_f16(gx2, gx, gx);
-    gy2  = vfmaq_f16(gy2, gy, gy);
-    gxgy = vfmaq_f16(gxgy, gx, gy);
-
-    if(block_size > 3)
-    {
-        gx = vextq_f16(low_gx, high_gx, 3);
-        gy = vextq_f16(low_gy, high_gy, 3);
-
-        gx2  = vfmaq_f16(gx2, gx, gx);
-        gy2  = vfmaq_f16(gy2, gy, gy);
-        gxgy = vfmaq_f16(gxgy, gx, gy);
-
-        gx = vextq_f16(low_gx, high_gx, 4);
-        gy = vextq_f16(low_gy, high_gy, 4);
-
-        gx2  = vfmaq_f16(gx2, gx, gx);
-        gy2  = vfmaq_f16(gy2, gy, gy);
-        gxgy = vfmaq_f16(gxgy, gx, gy);
-    }
-
-    if(block_size == 7)
-    {
-        gx = vextq_f16(low_gx, high_gx, 5);
-        gy = vextq_f16(low_gy, high_gy, 5);
-
-        gx2  = vfmaq_f16(gx2, gx, gx);
-        gy2  = vfmaq_f16(gy2, gy, gy);
-        gxgy = vfmaq_f16(gxgy, gx, gy);
-
-        gx = vextq_f16(low_gx, high_gx, 6);
-        gy = vextq_f16(low_gy, high_gy, 6);
-
-        gx2  = vfmaq_f16(gx2, gx, gx);
-        gy2  = vfmaq_f16(gy2, gy, gy);
-        gxgy = vfmaq_f16(gxgy, gx, gy);
-    }
-}
-
-template <size_t block_size>
-inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
-                                       float strength_thresh)
-{
-    auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
-    auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
-    const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
-    const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
-    const auto     output   = static_cast<float *__restrict>(out_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float16x8_t gx2  = vdupq_n_f16(0.0f);
-    float16x8_t gy2  = vdupq_n_f16(0.0f);
-    float16x8_t gxgy = vdupq_n_f16(0.0f);
-
-    for(size_t i = 0; i < block_size; ++i)
-    {
-        const float16x8_t low_gx  = vcvtq_f16_s16(vld1q_s16(gx_ptr_0));
-        const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1));
-        const float16x8_t low_gy  = vcvtq_f16_s16(vld1q_s16(gy_ptr_0));
-        const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1));
-        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += in_stride;
-        gy_ptr_0 += in_stride;
-        gx_ptr_1 += in_stride;
-        gy_ptr_1 += in_stride;
-    }
-
-    // Calculate harris score
-    const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
-    // Store score
-    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
-    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <size_t block_size>
-inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
-                                       float strength_thresh)
-{
-    static const float16x8_t zero = vdupq_n_f16(0.0f);
-
-    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - (block_size / 2) * (in_stride + 1);
-    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - (block_size / 2) * (in_stride + 1);
-    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
-    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
-    const auto     output   = static_cast<float *__restrict>(out_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float16x8_t gx2  = zero;
-    float16x8_t gy2  = zero;
-    float16x8_t gxgy = zero;
-
-    for(size_t i = 0; i < block_size; ++i)
-    {
-        const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
-                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
-        const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
-                                                 vget_low_f16(zero));
-        const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
-                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
-        const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
-                                                 vget_low_f16(zero));
-        harris_score1xN_FLOAT_FLOAT_FLOAT<block_size>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += in_stride;
-        gy_ptr_0 += in_stride;
-        gx_ptr_1 += in_stride;
-        gy_ptr_1 += in_stride;
-        gx_ptr_2 += in_stride;
-        gy_ptr_2 += in_stride;
-    }
-
-    // Calculate harris score
-    const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
-    // Store score
-    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
-    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-template <>
-inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity,
-                                          float strength_thresh)
-{
-    static const float16x8_t zero = vdupq_n_f16(0.0f);
-
-    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(in1_ptr) - 3 * (in_stride + 1);
-    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(in2_ptr) - 3 * (in_stride + 1);
-    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
-    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
-    const int32_t *gx_ptr_3 = gx_ptr_0 + 12;
-    const int32_t *gy_ptr_3 = gy_ptr_0 + 12;
-    const auto     output   = static_cast<float *__restrict>(out_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float16x8_t gx2  = zero;
-    float16x8_t gy2  = zero;
-    float16x8_t gxgy = zero;
-
-    for(size_t i = 0; i < 7; ++i)
-    {
-        const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))),
-                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1))));
-        const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))),
-                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3))));
-        const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))),
-                                                vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1))));
-        const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))),
-                                                 vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3))));
-        harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += in_stride;
-        gy_ptr_0 += in_stride;
-        gx_ptr_1 += in_stride;
-        gy_ptr_1 += in_stride;
-        gx_ptr_2 += in_stride;
-        gy_ptr_2 += in_stride;
-    }
-
-    // Calculate harris score
-    const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
-    // Store score
-    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc)));
-    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc)));
-}
-
-} // namespace fp16
-
-template <int32_t block_size>
-BorderSize        NEHarrisScoreFP16Kernel<block_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <int32_t block_size>
-NEHarrisScoreFP16Kernel<block_size>::NEHarrisScoreFP16Kernel()
-    : INEHarrisScoreKernel(), _func(nullptr)
-{
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    Iterator input1(_input1, window);
-    Iterator input2(_input2, window);
-    Iterator output(_output, window);
-
-    const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
-    },
-    input1, input2, output);
-}
-
-template <int32_t block_size>
-void NEHarrisScoreFP16Kernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
-                                                    bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-    ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
-    _input1          = input1;
-    _input2          = input2;
-    _output          = output;
-    _sensitivity     = sensitivity;
-    _strength_thresh = strength_thresh;
-    _norm_factor     = norm_factor;
-    _border_size     = BorderSize(block_size / 2);
-
-    if(input1->info()->data_type() == DataType::S16)
-    {
-        _func = &fp16::harris_score_S16_S16_FLOAT<block_size>;
-    }
-    else
-    {
-        _func = &fp16::harris_score_S32_S32_FLOAT<block_size>;
-    }
-
-    ARM_COMPUTE_ERROR_ON(nullptr == _func);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = block_size;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-template class arm_compute::NEHarrisScoreFP16Kernel<3>;
-template class arm_compute::NEHarrisScoreFP16Kernel<5>;
-template class arm_compute::NEHarrisScoreFP16Kernel<7>;
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
 template class arm_compute::NEHarrisScoreKernel<3>;
 template class arm_compute::NEHarrisScoreKernel<5>;
 template class arm_compute::NEHarrisScoreKernel<7>;
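
For reference, the scalar form of the Harris response computed by the removed fp16::harris_score helper (a sketch mirroring the intrinsics above, not library code):

    inline float harris_response(float gx2, float gy2, float gxgy,
                                 float sensitivity, float strength_thresh)
    {
        const float trace2 = (gx2 + gy2) * (gx2 + gy2); // (trace of the structure tensor)^2
        const float det    = gx2 * gy2 - gxgy * gxgy;   // determinant of the structure tensor
        const float mc     = det - sensitivity * trace2;
        return (mc > strength_thresh) ? mc : 0.0f;
    }
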
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 98b1488..2c51eae 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -41,11 +41,12 @@
 #include <tuple>
 
 using namespace arm_compute;
+using namespace misc::shape_calculator;
 
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                          bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+                          bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -55,18 +56,7 @@
 
     if(output->total_size() > 0)
     {
-        TensorShape expected_output_shape;
-
-        if(is_flatten || is_fully_connected)
-        {
-            expected_output_shape = misc::shape_calculator::compute_flatten_shape(input);
-        }
-        else
-        {
-            expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false);
-        }
-
-        TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+        TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -74,23 +64,48 @@
     return Status{};
 }
 
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                                                        bool has_bias, const Size2D &dilation)
+{
+    const unsigned int width_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+
+    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+                                                                             kernel_dims.width, kernel_dims.height,
+                                                                             conv_info, dilation);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)));
+
+    Window win = calculate_max_window(*input, Steps());
+    win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
+    win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
+    win.set(channel_idx, Window::Dimension(0, 1, 1));
+
+    // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
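
A hedged sketch of the output-size arithmetic behind scaled_dimensions() as used above, shown for the FLOOR rounding policy (the library also supports CEIL rounding; this is illustrative, not the library routine):

    // Convolved output size along one dimension.
    inline int conv_out_dim(int in_size, int kernel_size, int stride,
                            int pad_before, int pad_after, int dilation)
    {
        const int dilated_kernel = (kernel_size - 1) * dilation + 1;
        return (in_size + pad_before + pad_after - dilated_kernel) / stride + 1;
    }
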
+
 template <typename T, bool has_pads>
-inline void linearize_volume(const uint8_t *const in_ptr,
-                             T                   *out_ptr,
-                             bool                 has_bias,
-                             int                  top_left_x,
-                             int                  top_left_y,
-                             int                  kernel_width,
-                             int                  kernel_height,
-                             int                  kernel_depth,
-                             int                  input_w,
-                             int                  input_h,
-                             int                  input_stride_x,
-                             int                  input_stride_y,
-                             int                  input_stride_z,
-                             int                  pad_value,
-                             int                  dilation_x,
-                             int                  dilation_y)
+inline void linearize_volume_nchw(const uint8_t *const in_ptr,
+                                  T                   *out_ptr,
+                                  bool                 has_bias,
+                                  int                  top_left_x,
+                                  int                  top_left_y,
+                                  int                  kernel_width,
+                                  int                  kernel_height,
+                                  int                  kernel_depth,
+                                  int                  input_w,
+                                  int                  input_h,
+                                  int                  input_stride_x,
+                                  int                  input_stride_y,
+                                  int                  input_stride_z,
+                                  int                  pad_value,
+                                  int                  dilation_x,
+                                  int                  dilation_y)
 {
     const int kernel_size2 = kernel_width * kernel_height;
     const int x_e          = top_left_x + kernel_width * dilation_x;
@@ -171,10 +186,63 @@
         *out_ptr = static_cast<T>(1);
     }
 }
-} // namespace
 
 template <typename T, bool has_pads>
-void NEIm2ColKernel::run_generic(const Window &window)
+inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
+                                  T                   *out_ptr,
+                                  bool                 has_bias,
+                                  int                  start_x,
+                                  int                  start_y,
+                                  int                  kernel_width,
+                                  int                  kernel_height,
+                                  int                  input_w,
+                                  int                  input_h,
+                                  int                  input_c,
+                                  int                  input_stride_y,
+                                  int                  input_stride_z,
+                                  int                  pad_value,
+                                  int                  dilation_x,
+                                  int                  dilation_y)
+{
+    const int end_x     = start_x + kernel_width * dilation_x;
+    const int end_y     = start_y + kernel_height * dilation_y;
+    const int pad_quant = kernel_width * input_c;
+
+    for(int y = start_y; y < end_y; y += dilation_y)
+    {
+        if(y < 0 || y >= input_h)
+        {
+            memset(out_ptr, pad_value, pad_quant * sizeof(T));
+            out_ptr += pad_quant;
+        }
+        else
+        {
+            for(int x = start_x; x < end_x; x += dilation_x)
+            {
+                if(x < 0 || x >= input_w)
+                {
+                    memset(out_ptr, pad_value, input_c * sizeof(T));
+                    out_ptr += input_c;
+                }
+                else
+                {
+                    memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * sizeof(T));
+                    out_ptr += input_c;
+                }
+            }
+        }
+    }
+
+    // Append 1 if the convolution layer has biases
+    if(has_bias)
+    {
+        *out_ptr = static_cast<T>(1);
+    }
+}
+} // namespace
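
A hedged reference sketch of the row layout produced by linearize_volume_nhwc above: for each kernel position the full channel vector is appended, with out-of-range positions filled with the pad value (names and the std::vector return are illustrative; the trailing bias 1 is omitted here).

    #include <vector>

    std::vector<float> im2col_row_nhwc(const float *in, int W, int H, int C,
                                       int start_x, int start_y, int kw, int kh,
                                       int dil_x, int dil_y, float pad_value)
    {
        std::vector<float> row;
        row.reserve(static_cast<size_t>(kw) * kh * C);
        for(int y = start_y; y < start_y + kh * dil_y; y += dil_y)
        {
            for(int x = start_x; x < start_x + kw * dil_x; x += dil_x)
            {
                for(int c = 0; c < C; ++c)
                {
                    const bool pad = (x < 0 || x >= W || y < 0 || y >= H);
                    row.push_back(pad ? pad_value : in[(y * W + x) * C + c]); // NHWC addressing
                }
            }
        }
        return row;
    }
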
+
+template <typename T, bool has_pads, bool is_nchw>
+void NEIm2ColKernel::run_im2col(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
@@ -184,25 +252,17 @@
     const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    const int kernel_depth   = _input->info()->dimension(channel_idx);
     const int input_w        = _input->info()->dimension(width_idx);
     const int input_h        = _input->info()->dimension(height_idx);
-    const int input_stride_x = _input->info()->strides_in_bytes()[width_idx];
-    const int input_stride_y = _input->info()->strides_in_bytes()[height_idx];
-    const int input_stride_z = _input->info()->strides_in_bytes()[channel_idx];
-    const int offset         = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
-
-    int pad_left = 0;
-    int pad_top  = 0;
-    int stride_x = 0;
-    int stride_y = 0;
-    pad_left     = _conv_info.pad_left();
-    pad_top      = _conv_info.pad_top();
-    std::tie(stride_x, stride_y) = _conv_info.stride();
-
-    // Setup input window
-    const int start_x = -pad_left;
-    const int start_y = -pad_top;
+    const int input_c        = _input->info()->dimension(channel_idx);
+    const int input_stride_x = _input->info()->strides_in_bytes().x();
+    const int input_stride_y = _input->info()->strides_in_bytes().y();
+    const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const int pad_left       = _conv_info.pad_left();
+    const int pad_top        = _conv_info.pad_top();
+    const int stride_x       = _conv_info.stride().first;
+    const int stride_y       = _conv_info.stride().second;
+    const int pad_value      = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
 
     Window window_in_out(window);
     // The first three dimensions of the input and output are increased by the inner loops
@@ -216,94 +276,70 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int top_left_x = id[width_idx] * stride_x + start_x;
-        const int top_left_y = id[height_idx] * stride_y + start_y;
+        const int start_w = id[width_idx] * stride_x - pad_left;
+        const int start_h = id[height_idx] * stride_y - pad_top;
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr();
         auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
 
         // Linearize volume
-        linearize_volume<T, has_pads>(input_ptr,
-                                      output_ptr,
-                                      _has_bias,
-                                      top_left_x,
-                                      top_left_y,
-                                      static_cast<int>(_kernel_width),
-                                      static_cast<int>(_kernel_height),
-                                      kernel_depth,
-                                      input_w,
-                                      input_h,
-                                      input_stride_x,
-                                      input_stride_y,
-                                      input_stride_z,
-                                      offset,
-                                      _dilation.x(),
-                                      _dilation.y());
+        if(is_nchw)
+        {
+            linearize_volume_nchw<T, has_pads>(input_ptr,
+                                               output_ptr,
+                                               _has_bias,
+                                               start_w,
+                                               start_h,
+                                               _kernel_width,
+                                               _kernel_height,
+                                               input_c,
+                                               input_w,
+                                               input_h,
+                                               input_stride_x,
+                                               input_stride_y,
+                                               input_stride_z,
+                                               pad_value,
+                                               _dilation.x(),
+                                               _dilation.y());
+        }
+        else
+        {
+            linearize_volume_nhwc<T, has_pads>(input_ptr,
+                                               output_ptr,
+                                               _has_bias,
+                                               start_w,
+                                               start_h,
+                                               _kernel_width,
+                                               _kernel_height,
+                                               input_w,
+                                               input_h,
+                                               input_c,
+                                               input_stride_y,
+                                               input_stride_z,
+                                               pad_value,
+                                               _dilation.x(),
+                                               _dilation.y());
+        }
     },
     in, out);
 }
 
-template <typename T>
-void NEIm2ColKernel::run_reduced(const Window &window)
-{
-    const size_t in_width   = _input->info()->dimension(0);
-    const size_t in_height  = _input->info()->dimension(1);
-    const size_t out_step_x = in_width * _input->info()->element_size();
-    const size_t out_step_y = out_step_x * in_height;
-    const size_t out_width  = _output->info()->dimension(0);
-
-    Window in_window(window);
-    in_window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Window out_window;
-    out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-    out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width));
-
-    Window in_slice  = in_window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_1D();
-
-    do
-    {
-        Iterator in(_input, in_slice);
-        Iterator out(_output, out_slice);
-
-        uint8_t *out_ptr = out.ptr();
-
-        execute_window_loop(in_slice, [&](const Coordinates & id)
-        {
-            memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x);
-        },
-        in);
-
-        // Add bias
-        if(_has_bias)
-        {
-            *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
-        }
-    }
-    while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
-}
-
 NEIm2ColKernel::NEIm2ColKernel()
     : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
 {
 }
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                               bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+                               bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Perform validation step
-    ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
     ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
 
     const DataLayout   data_layout = input->info()->data_layout();
     const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
     _input          = input;
     _output         = output;
@@ -316,33 +352,20 @@
                                         _conv_info, _dilation);
     _has_bias = has_bias;
 
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-
-    bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
-                               && (std::equal(input->info()->tensor_shape().cbegin() + 3,
-                                              input->info()->tensor_shape().cend(),
-                                              output->info()->tensor_shape().cbegin() + 1))
-                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
-                               && ((dilation.x() == 1) && (dilation.y() == 1));
-
-    Window window = calculate_max_window(*input->info(), Steps());
-
-    if(run_img2col_reduced)
+    if(data_layout == DataLayout::NCHW)
     {
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                _func = &NEIm2ColKernel::run_reduced<float>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, true> : &NEIm2ColKernel::run_im2col<float, true, true>;
                 break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = &NEIm2ColKernel::run_reduced<float16_t>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8:
-                _func = &NEIm2ColKernel::run_reduced<qasymm8_t>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -354,35 +377,33 @@
         switch(_input->info()->data_type())
         {
             case DataType::F32:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float, false> : &NEIm2ColKernel::run_generic<float, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, false> : &NEIm2ColKernel::run_im2col<float, true, false>;
                 break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, false> : &NEIm2ColKernel::run_im2col<float16_t, true, false>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qasymm8_t, false> : &NEIm2ColKernel::run_generic<qasymm8_t, true>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
         }
-        window.set(width_idx, Window::Dimension(0, _convolved_dims.first, 1));
-        window.set(height_idx, Window::Dimension(0, _convolved_dims.second, 1));
-        window.set(channel_idx, Window::Dimension(0, 1, 1));
     }
 
-    // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    IKernel::configure(window);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, const Size2D &dilation, unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+                                bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation).first);
     return Status{};
 }
 
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 4d3ec46..46b7913 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 
-#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index a6e2b00..52dbe26 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -887,9 +887,12 @@
         input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
     }
 
+    std::array<uint8_t, mask_size> vals{ {} };
+
     execute_window_loop(win, [&](const Coordinates & id)
     {
-        std::array<uint8_t, mask_size> vals{ {} };
+        // Clear array
+        std::fill(std::begin(vals), std::end(vals), 0);
 
         size_t v = 0;
         size_t m = 0;
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index fe6b69c..27af121 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -61,30 +62,40 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *input_squared, ITensorInfo *output, const NormalizationLayerInfo &norm_info)
 {
-    unsigned int       num_elems_processed_per_iteration = 16 / input->element_size();
-    const unsigned int num_elems_read_per_iteration      = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-    const unsigned int norm_idx                          = get_normalization_dimension_index(input->data_layout(), norm_info);
-    const unsigned int num_rows                          = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1;
-    const unsigned int border_width                      = (norm_idx == 2) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
-    BorderSize         border_size                       = BorderSize(0, border_width);
-    bool               window_changed                    = false;
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+
+    const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
+    const bool         is_norm_accross_width = norm_idx == 0;
+
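+    // Normalization across the width reads neighbouring elements along X, so a horizontal border of one vector minus one element is required on each side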
+    const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
+    const BorderSize   border_size  = BorderSize(0, border_width);
 
     // Configure window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
 
-    AccessWindowRectangle input_access(input, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
-    AccessWindowRectangle input_squared_access(input_squared, -border_size.left, 0, num_elems_read_per_iteration, num_rows);
+    if(is_norm_accross_width)
+    {
+        AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+        AccessWindowStatic input_squared_access(input_squared, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
+        window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
+    }
+    else
+    {
+        AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
+        AccessWindowHorizontal input_squared_access(input_squared, -border_size.left, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
+    }
 
     if(output->total_size() != 0)
     {
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = update_window_and_padding(win, input_access, input_squared_access, output_access);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
         output_access.set_valid_region(win, input->valid_region());
     }
-    else
-    {
-        window_changed = update_window_and_padding(win, input_access, input_squared_access);
-    }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
@@ -110,8 +121,11 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
 
-    const unsigned int norm_idx     = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
-    const unsigned int border_width = (norm_idx == 2) ? 0 : std::min<unsigned int>(norm_info.norm_size() / 2, 3U);
+    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+
+    const unsigned int norm_idx              = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
+    const bool         is_norm_accross_width = norm_idx == 0;
+    const unsigned int border_width          = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
 
     _input         = input;
     _input_squared = input_squared;
@@ -190,12 +204,10 @@
 
     const int dim_y                = 1;
     const int radius               = _norm_info.norm_size() / 2;
-    const int total_size           = _input->info()->dimension(dim) - 1;
     const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
     // We account for padding across X only and we iterate over rows
     const int min_left   = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
-    const int max_right  = (dim == 2) ? total_size : total_size + border_size().left;
-    const int min_top    = 0;
+    const int max_right  = _input->info()->dimension(dim) - 1;
     const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
     if(dt == DataType::F32)
@@ -209,7 +221,7 @@
             // Get range to normalize
             const int current_row   = do_2D_norm ? id[dim_y] : 0;
             const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
             const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
             const int first_slice   = std::max(current_slice - radius, min_left);
             const int last_slice    = std::min(current_slice + radius, max_right);
@@ -246,7 +258,7 @@
             // Get range to normalize
             const int current_row   = do_2D_norm ? id[dim_y] : 0;
             const int current_slice = id[dim];
-            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
             const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
             const int first_slice   = std::max(current_slice - radius, min_left);
             const int last_slice    = std::min(current_slice + radius, max_right);
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 8d3fd88..29e6d50 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -50,7 +50,8 @@
                                                          DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
+                                    && (perm != PermutationVector{ 1U, 2U, 0U }),
                                     "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
 
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
@@ -89,7 +90,7 @@
     Iterator out(_output, window_out);
 
     // CHW -> HWC
-    if((_perm.num_dimensions() == 3) && (_perm[0] == 2) && (_perm[1] == 0) && (_perm[2] == 1))
+    if(_perm == PermutationVector{ 2U, 0U, 1U })
     {
         const int in_row_stride     = _input->info()->strides_in_bytes().y() / sizeof(T);
         const int in_channel_stride = _input->info()->strides_in_bytes().z() / sizeof(T);
@@ -116,7 +117,7 @@
         in, out);
     }
     // HWC -> CHW
-    else if((_perm.num_dimensions() == 3) && (_perm[0] == 1) && (_perm[1] == 2) && (_perm[2] == 0))
+    else if(_perm == PermutationVector{ 1U, 2U, 0U })
     {
         const int in_col_stride   = _input->info()->strides_in_bytes().y() / sizeof(T);
         const int in_row_stride   = _input->info()->strides_in_bytes().z() / sizeof(T);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index ad4b8f7..310560b 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -35,6 +35,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/ToolchainSupport.h"
 
@@ -47,18 +48,10 @@
 #include <tuple>
 
 using namespace arm_compute;
+using namespace misc::shape_calculator;
 
 namespace
 {
-void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
-{
-    TensorShape output_shape{ input->tensor_shape() };
-    output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH), pooled_w);
-    output_shape.set(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT), pooled_h);
-
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-}
-
 template <bool exclude_padding, DataLayout data_layout>
 inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
@@ -166,7 +159,9 @@
                                                         BorderSize &border_size,
                                                         unsigned int pooled_w, unsigned int pooled_h, int pool_size_x, int pool_size_y)
 {
-    // Get data layout
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_pool_shape(*input, pool_info)));
+
     DataLayout          data_layout                  = input->data_layout();
     unsigned int        num_elems_read_per_iteration = 0;
     unsigned int        num_elems_horizontal_window  = 0;
@@ -190,7 +185,6 @@
                                                      pool_size_x,
                                                      pool_size_y,
                                                      pad_stride_info);
-    auto_init(input, output, pooled_w, pooled_h);
 
     // If the pooling size is not square or not optimized, the generic MxN implementation will be executed
     num_elems_read_per_iteration      = 1;
@@ -206,7 +200,7 @@
             case DataType::QASYMM8:
                 if(is_nhwc)
                 {
-                    num_elems_processed_per_iteration = 8;
+                    num_elems_processed_per_iteration = 16;
                     break;
                 }
                 switch(pool_size_x)
@@ -277,8 +271,7 @@
     {
         if(is_nhwc)
         {
-            const unsigned int vector_size    = 16 / input->element_size();
-            num_elems_processed_per_iteration = (input->data_type() == DataType::QASYMM8) ? 8 : vector_size;
+            num_elems_processed_per_iteration = 16 / input->element_size();
         }
     }
 
@@ -371,9 +364,6 @@
                                                      pool_size_y,
                                                      pad_stride_info);
 
-    // Output auto initialization if not yet initialized
-    auto_init(input->info(), output->info(), pooled_w, pooled_h);
-
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
 
@@ -1561,8 +1551,16 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int idx_width  = id.y() * pool_stride_x;
-        const int idx_height = id.z() * pool_stride_y;
+        const int idx_width    = id.y() * pool_stride_x;
+        const int idx_height   = id.z() * pool_stride_y;
+        const int pool_limit_y = pool_pad_top - idx_height;
+        const int pool_limit_x = pool_pad_left - idx_width;
+
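+        // Clamp the pooling region to the valid part of the input window so the inner loops need no per-element bounds checks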
+        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+        const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+        const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
@@ -1572,21 +1570,10 @@
 
             // Perform pooling
             vres = vdupq_n_f16(0.0f);
-
-            for(int y = 0; y < pool_size_y; ++y)
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
-
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
                     const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
                                                                                            (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
 
@@ -1607,20 +1594,11 @@
         else
         {
             vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
-            for(int y = 0; y < pool_size_y; ++y)
+
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height > window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
-
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width > window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
                     const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
                                                                                            (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
                     vres                   = vmaxq_f16(vres, data);
@@ -1792,8 +1770,16 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int idx_width  = id.y() * pool_stride_x;
-        const int idx_height = id.z() * pool_stride_y;
+        const int idx_width    = id.y() * pool_stride_x;
+        const int idx_height   = id.z() * pool_stride_y;
+        const int pool_limit_y = pool_pad_top - idx_height;
+        const int pool_limit_x = pool_pad_left - idx_width;
+
+        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+        const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+        const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
         if(pooling_type != PoolingType::MAX)
         {
             // Calculate scale
@@ -1804,20 +1790,10 @@
             // Perform pooling
             vres = vdupq_n_f32(0.0f);
 
-            for(int y = 0; y < pool_size_y; ++y)
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
-
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
                     const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
                                                                                        (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
 
@@ -1838,20 +1814,10 @@
         else
         {
             vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
-            for(int y = 0; y < pool_size_y; ++y)
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
-
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
                     const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
                                                                                        (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
                     vres                   = vmaxq_f32(vres, data);
@@ -1862,8 +1828,7 @@
         // Calculate square-root in case of l2 pooling
         if(pooling_type == PoolingType::L2)
         {
-            float32x4_t sqrt_reciprocal = vrsqrteq_f32(vres);
-            vres                        = vmulq_f32(vres, vmulq_f32(vrsqrtsq_f32(vmulq_f32(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
+            vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
         }
 
         // Store result
@@ -1986,14 +1951,26 @@
     const int upper_bound_w = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = _input->info()->dimension(2) + (exclude_padding ? 0 : pool_pad_bottom);
 
+    const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
+
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int idx_width  = id.y() * pool_stride_x;
-        const int idx_height = id.z() * pool_stride_y;
+        const int idx_width    = id.y() * pool_stride_x;
+        const int idx_height   = id.z() * pool_stride_y;
+        const int pool_limit_y = pool_pad_top - idx_height;
+        const int pool_limit_x = pool_pad_left - idx_width;
+
+        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+        const int pool_end_y   = std::min(pool_size_y, window_input.z().end() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+        const int pool_end_x   = std::min(pool_size_x, window_input.y().end() + pool_limit_x);
+
         if(pooling_type != PoolingType::MAX)
         {
             uint32x4_t vres1 = vdupq_n_u32(0);
             uint32x4_t vres2 = vdupq_n_u32(0);
+            uint32x4_t vres3 = vdupq_n_u32(0);
+            uint32x4_t vres4 = vdupq_n_u32(0);
 
             // Calculate scale
             const float scale = calculate_avg_scale<exclude_padding, DataLayout::NHWC>(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
@@ -2001,63 +1978,50 @@
             const float32x4_t scale_v = vdupq_n_f32(scale);
 
             // Perform pooling
-            for(int y = 0; y < pool_size_y; ++y)
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
+                    const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
 
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
-                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
-                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
-
-                    const uint16x8_t data_u16 = vmovl_u8(data);
-                    vres1                     = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
-                    vres2                     = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+                    const uint16x8_t data_u16  = vmovl_u8(vget_low_u8(data));
+                    const uint16x8_t data2_u16 = vmovl_u8(vget_high_u8(data));
+                    vres1                      = vaddq_u32(vres1, vmovl_u16(vget_low_u16(data_u16)));
+                    vres2                      = vaddq_u32(vres2, vmovl_u16(vget_high_u16(data_u16)));
+                    vres3                      = vaddq_u32(vres3, vmovl_u16(vget_low_u16(data2_u16)));
+                    vres4                      = vaddq_u32(vres4, vmovl_u16(vget_high_u16(data2_u16)));
                 }
             }
-            // Divide by scale
-            vres1 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres1), scale_v));
-            vres2 = vcvtq_u32_f32(vmulq_f32(vcvtq_f32_u32(vres2), scale_v));
+            // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+            vres1 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres1), scale_v));
+            vres2 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres2), scale_v));
+            vres3 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres3), scale_v));
+            vres4 = vcvtq_u32_f32(vmlaq_f32(half_scale_v, vcvtq_f32_u32(vres4), scale_v));
 
-            uint8x8_t res = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+            uint8x8_t res1 = vmovn_u16(vcombine_u16(vmovn_u32(vres1), vmovn_u32(vres2)));
+            uint8x8_t res2 = vmovn_u16(vcombine_u16(vmovn_u32(vres3), vmovn_u32(vres4)));
 
             // Store result
-            vst1_u8(output.ptr(), res);
+            vst1_u8(output.ptr(), res1);
+            vst1_u8(output.ptr() + 8, res2);
         }
         else
         {
-            uint8x8_t vres = vdup_n_u8(0);
+            uint8x16_t vres = vdupq_n_u8(0);
 
-            for(int y = 0; y < pool_size_y; ++y)
+            for(int y = pool_start_y; y < pool_end_y; ++y)
             {
-                if(y + idx_height - pool_pad_top >= window_input.z().end() || y + idx_height - pool_pad_top < window_input.z().start())
+                for(int x = pool_start_x; x < pool_end_x; ++x)
                 {
-                    continue;
-                }
-
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    if(x + idx_width - pool_pad_left >= window_input.y().end() || x + idx_width - pool_pad_left < window_input.y().start())
-                    {
-                        continue;
-                    }
-
-                    const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
-                                                                                     (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
-                    vres                 = vmax_u8(vres, data);
+                    const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().y() +
+                                                                                       (y - pool_pad_top) * _input->info()->strides_in_bytes().z()));
+                    vres                  = vmaxq_u8(vres, data);
                 }
             }
 
             // Store result
-            vst1_u8(output.ptr(), vres);
+            vst1q_u8(output.ptr(), vres);
         }
     },
     input, output);
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
new file mode 100644
index 0000000..2f63179
--- /dev/null
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, input2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+
+    // Check variances
+    const int var_size = info.variances().size();
+    if(var_size > 1)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
+        for(int i = 0; i < var_size; ++i)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.variances().at(i) <= 0.f, "Each variance must be greater than 0");
+        }
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater than or equal to 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater than or equal to 0");
+
+    if(!info.max_sizes().empty())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+    }
+
+    for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+    }
+
+    if(output != nullptr && output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(get_data_layout_dimension_index(input1->data_layout(), DataLayoutDimension::HEIGHT)) != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input1, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_UNUSED(input2);
+
+    Window win            = {};
+    bool   window_changed = false;
+    switch(input1->data_layout())
+    {
+        case DataLayout::NCHW:
+        {
+            const int          num_priors                        = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+            const unsigned int num_elems_processed_per_iteration = 4 * num_priors;
+            win                                                  = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+            window_changed = update_window_and_padding(win, output_access);
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            win = calculate_max_window(*output, Steps());
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    };
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
+    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+{
+}
+
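+// Convert a prior box given by its centre and size into normalized [xmin, ymin, xmax, ymax] coordinates,
+// optionally clip them to [0, 1] and store them at the given offset according to the data layout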
+template <DataLayout DL>
+void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
+                                              const int height)
+{
+    float xmin = (center_x - box_width / 2.f) / width;
+    float ymin = (center_y - box_height / 2.f) / height;
+    float xmax = (center_x + box_width / 2.f) / width;
+    float ymax = (center_y + box_height / 2.f) / height;
+
+    switch(DL)
+    {
+        case DataLayout::NCHW:
+        {
+            float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
+            if(_info.clip())
+            {
+                static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+                static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+                vec_elements                     = vmaxq_f32(vminq_f32(vec_elements, CONST_1), CONST_0);
+            }
+            vst1q_f32(out + offset, vec_elements);
+        }
+        break;
+        case DataLayout::NHWC:
+        {
+            const int output_offset = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();
+            if(_info.clip())
+            {
+                xmin = std::min(std::max(xmin, 0.f), 1.f);
+                ymin = std::min(std::max(ymin, 0.f), 1.f);
+                xmax = std::min(std::max(xmax, 0.f), 1.f);
+                ymax = std::min(std::max(ymax, 0.f), 1.f);
+            }
+
+            *(out + output_offset * offset)       = xmin;
+            *(out + output_offset * (offset + 1)) = ymin;
+            *(out + output_offset * (offset + 2)) = xmax;
+            *(out + output_offset * (offset + 3)) = ymax;
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
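+// Generate the prior boxes for each anchor position covered by the window: one box per minimum size, an extra box per
+// maximum size and one per additional aspect ratio, followed by the variance values associated with every box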
+template <DataLayout DL>
+void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
+{
+    const int num_priors = _info.aspect_ratios().size() * _info.min_sizes().size() + _info.max_sizes().size();
+
+    const int width_idx  = get_data_layout_dimension_index(DL, DataLayoutDimension::WIDTH);
+    const int height_idx = get_data_layout_dimension_index(DL, DataLayoutDimension::HEIGHT);
+
+    const int layer_width  = _input1->info()->dimension(width_idx);
+    const int layer_height = _input1->info()->dimension(height_idx);
+
+    int img_width  = _info.img_size().x;
+    int img_height = _info.img_size().y;
+    if(img_width == 0 || img_height == 0)
+    {
+        img_width  = _input2->info()->dimension(width_idx);
+        img_height = _input2->info()->dimension(height_idx);
+    }
+
+    float step_x = _info.steps()[0];
+    float step_y = _info.steps()[1];
+    if(step_x == 0.f || step_y == 0.f)
+    {
+        step_x = static_cast<float>(img_width) / layer_width;
+        step_y = static_cast<float>(img_height) / layer_height;
+    }
+
+    Window slice = {};
+
+    switch(DL)
+    {
+        case DataLayout::NCHW:
+            slice = window.first_slice_window_2D();
+            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
+            break;
+        case DataLayout::NHWC:
+            slice = window.first_slice_window_3D();
+            slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 4 * num_priors));
+            slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), 2));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    Iterator output(_output, slice);
+    execute_window_loop(slice, [&](const Coordinates & id)
+    {
+        float center_x = 0;
+        float center_y = 0;
+        int   idx      = 0;
+        switch(DL)
+        {
+            case DataLayout::NCHW:
+                idx      = id.x() / (4 * num_priors);
+                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+                break;
+            case DataLayout::NHWC:
+                idx      = id.y() / (4 * num_priors);
+                center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+                center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not implemented");
+        }
+
+        float box_width;
+        float box_height;
+        int   offset = 0;
+
+        auto out = reinterpret_cast<float *>(output.ptr());
+        for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+        {
+            const float min_size = _info.min_sizes().at(i);
+            box_width            = min_size;
+            box_height           = min_size;
+            store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+            offset += 4;
+
+            if(!_info.max_sizes().empty())
+            {
+                const float max_size = _info.max_sizes().at(i);
+                box_width            = std::sqrt(min_size * max_size);
+                box_height           = box_width;
+
+                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                offset += 4;
+            }
+
+            // rest of priors
+            for(auto ar : _info.aspect_ratios())
+            {
+                if(fabs(ar - 1.) < 1e-6)
+                {
+                    continue;
+                }
+
+                box_width  = min_size * sqrt(ar);
+                box_height = min_size / sqrt(ar);
+
+                store_coordinates<DL>(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                offset += 4;
+            }
+        }
+
+        // set the variance
+        switch(DL)
+        {
+            case DataLayout::NCHW:
+            {
+                out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+                float32x4_t var;
+                if(_info.variances().size() == 1)
+                {
+                    var = vdupq_n_f32(_info.variances().at(0));
+                }
+                else
+                {
+                    const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
+                    var                    = vars;
+                }
+                for(int i = 0; i < num_priors; ++i)
+                {
+                    vst1q_f32(out + 4 * i, var);
+                }
+            }
+            break;
+            case DataLayout::NHWC:
+            {
+                for(int i = 0; i < num_priors; ++i)
+                {
+                    const int  prior_offset = 4 * i;
+                    const bool single_var   = _info.variances().size() == 1;
+                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 0, 1)))) = _info.variances().at(0);
+                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 1, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(1);
+                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 2, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(2);
+                    *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(0, id.y() + prior_offset + 3, 1)))) = single_var ? _info.variances().at(0) : _info.variances().at(3);
+                }
+            }
+            break;
+            default:
+                ARM_COMPUTE_ERROR("Not implemented");
+        }
+
+    },
+    output);
+}
+
+void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), info));
+
+    _input1 = input1;
+    _input2 = input2;
+    _info   = info;
+    _output = output;
+
+    switch(input1->info()->data_layout())
+    {
+        case DataLayout::NCHW:
+        {
+            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NCHW>;
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            _func = &NEPriorBoxLayerKernel::calculate_prior_boxes<DataLayout::NHWC>;
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented.");
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info(), info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info)
+                                .first);
+
+    return Status{};
+}
+void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    // Run function
+    (this->*_func)(window);
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 30f21bb..182e93d 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -32,10 +32,11 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 template <class F>
@@ -57,31 +58,284 @@
             Iterator in(input, in_slice);
             Iterator out(output, out_slice);
 
-            f(in, out, in_slice, out_slice);
+            f(in, out, in_slice, out_slice, *input->info());
         }
-        while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+        while(window.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+    }
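+
+    // Axis reducers for dimensions 1-3: each collapses the reduced dimension of the input window and passes the axis index to the reduction functor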
+    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f)
+    {
+        // Set in window
+        Window in_window(window);
+
+        in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        // Get first input and output slices
+        Window in_slice  = in_window.first_slice_window_2D();
+        Window out_slice = window.first_slice_window_2D();
+
+        do
+        {
+            Iterator in(input, in_slice);
+            Iterator out(output, out_slice);
+
+            f(in, out, in_slice, out_slice, *input->info(), 1);
+        }
+        while(in_window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+    }
+    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f)
+    {
+        // Set in window
+        Window in_window(window);
+
+        in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+        // Get first input and output slices
+        Window in_slice  = in_window.first_slice_window_3D();
+        Window out_slice = window.first_slice_window_3D();
+
+        do
+        {
+            Iterator in(input, in_slice);
+            Iterator out(output, out_slice);
+
+            f(in, out, in_slice, out_slice, *input->info(), 2);
+        }
+        while(in_window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+    }
+    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f)
+    {
+        // Set in/out window
+        Window in_window(window);
+        Window out_window(window);
+
+        in_window.set(3, Window::Dimension(0, 1, 1));
+        out_window.set(3, Window::Dimension(0, 1, 1));
+
+        // Get first input and output slices
+        Window in_slice  = in_window.first_slice_window_4D();
+        Window out_slice = out_window.first_slice_window_4D();
+
+        do
+        {
+            Iterator in(input, in_slice);
+            Iterator out(output, out_slice);
+
+            f(in, out, in_slice, out_slice, *input->info(), 3);
+        }
+        while(in_window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_4D(out_slice));
     }
 };
 
-struct SumsqOpX
+template <typename T, int S, ReductionOperation op>
+struct RedOpX
 {
-    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice)
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
     {
         ARM_COMPUTE_UNUSED(out_slice);
-        float32x4_t vec_sum_value = vdupq_n_f32(0.f);
+        auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
 
         execute_window_loop(in_slice, [&](const Coordinates & id)
         {
-            const auto        in_ptr       = reinterpret_cast<const float *>(input.ptr());
-            const float32x4_t vec_elements = vld1q_f32(in_ptr);
-            vec_sum_value                  = vaddq_f32(vmulq_f32(vec_elements, vec_elements), vec_sum_value);
+            const auto in_ptr       = reinterpret_cast<const T *>(input.ptr());
+            const auto vec_elements = wrapper::vloadq(in_ptr);
+
+            if(op == ReductionOperation::SUM_SQUARE)
+            {
+                vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
+            }
+            else
+            {
+                vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+            }
         },
         input);
 
-        float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value));
-        carry_addition             = vpadd_f32(carry_addition, carry_addition);
+        auto carry_addition = wrapper::vpadd(wrapper::vgethigh(vec_sum_value), wrapper::vgetlow(vec_sum_value));
+        for(int i = 0; i < S / 4; ++i)
+        {
+            carry_addition = wrapper::vpadd(carry_addition, carry_addition);
+        }
 
-        *(reinterpret_cast<float *>(output.ptr())) = vget_lane_f32(carry_addition, 0);
+        auto res = wrapper::vgetlane(carry_addition, 0);
+        if(op == ReductionOperation::MEAN_SUM)
+        {
+            res /= in_info.dimension(0);
+        }
+
+        *(reinterpret_cast<T *>(output.ptr())) = res;
+    }
+};
+
+template <ReductionOperation op>
+struct RedOpX_qasymm8
+{
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info)
+    {
+        ARM_COMPUTE_UNUSED(out_slice);
+        auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+        auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            const auto vec_elements = wrapper::vloadq(input.ptr());
+
+            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+            vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
+            vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
+            vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
+            vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+        },
+        input);
+
+        auto carry_addition = wrapper::vadd(vec_sum_value1, vec_sum_value2);
+        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value3);
+        carry_addition      = wrapper::vadd(carry_addition, vec_sum_value4);
+
+        auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_addition), wrapper::vgetlow(carry_addition));
+        carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
+        auto res             = wrapper::vgetlane(carry_paddition, 0);
+
+        if(op == ReductionOperation::MEAN_SUM)
+        {
+            res /= in_info.dimension(0);
+        }
+
+        *(output.ptr()) = static_cast<uint8_t>(res);
+    }
+};
+
+template <typename T, int S, ReductionOperation op>
+struct RedOpYZW
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    {
+        ARM_COMPUTE_UNUSED(out_slice);
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            auto vec_sum_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+            {
+                T *in_ptr;
+                switch(axis)
+                {
+                    case 1:
+                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
+                        break;
+                    case 2:
+                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
+                        break;
+                    case 3:
+                        in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
+                const auto vec_elements = wrapper::vloadq(in_ptr);
+
+                if(op == ReductionOperation::SUM_SQUARE)
+                {
+                    vec_sum_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_sum_value);
+                }
+                else
+                {
+                    vec_sum_value = wrapper::vadd(vec_elements, vec_sum_value);
+                }
+            }
+
+            if(op == ReductionOperation::MEAN_SUM)
+            {
+                auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+                vec_sum_value      = wrapper::vmul(vec_sum_value, vec_width_inv);
+            }
+
+            wrapper::vstore(reinterpret_cast<T *>(output.ptr()), vec_sum_value);
+        },
+        input, output);
+    }
+};
+
+template <ReductionOperation op>
+struct RedOpYZW_qasymm8
+{
+    inline void operator()(Iterator &input, Iterator &output, Window &in_slice, Window &out_slice, const TensorInfo &in_info, int axis)
+    {
+        ARM_COMPUTE_UNUSED(out_slice);
+
+        execute_window_loop(in_slice, [&](const Coordinates & id)
+        {
+            auto vec_sum_value1 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+            auto vec_sum_value2 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+            auto vec_sum_value3 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+            auto vec_sum_value4 = vdupq_n_u32(static_cast<uint32_t>(0.f));
+            for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+            {
+                uint8_t *in_ptr;
+                switch(axis)
+                {
+                    case 1:
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim));
+                        break;
+                    case 2:
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim));
+                        break;
+                    case 3:
+                        in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim));
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Not supported");
+                }
+                const auto vec_elements = wrapper::vloadq(in_ptr);
+
+                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                vec_sum_value1 = wrapper::vadd(temp32x4t_1, vec_sum_value1);
+                vec_sum_value2 = wrapper::vadd(temp32x4t_2, vec_sum_value2);
+                vec_sum_value3 = wrapper::vadd(temp32x4t_3, vec_sum_value3);
+                vec_sum_value4 = wrapper::vadd(temp32x4t_4, vec_sum_value4);
+            }
+
+            if(op == ReductionOperation::MEAN_SUM)
+            {
+                const auto vec_width_inv    = wrapper::vinv(vdupq_n_f32(in_info.dimension(axis)));
+                const auto vec_sum_value1_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value1), vec_width_inv);
+                const auto vec_sum_value2_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value2), vec_width_inv);
+                const auto vec_sum_value3_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value3), vec_width_inv);
+                const auto vec_sum_value4_f = wrapper::vmul(vcvtq_f32_u32(vec_sum_value4), vec_width_inv);
+
+                vec_sum_value1 = vcvtq_u32_f32(vec_sum_value1_f);
+                vec_sum_value2 = vcvtq_u32_f32(vec_sum_value2_f);
+                vec_sum_value3 = vcvtq_u32_f32(vec_sum_value3_f);
+                vec_sum_value4 = vcvtq_u32_f32(vec_sum_value4_f);
+            }
+
+            const auto temp16x8t_1 = vcombine_u16(wrapper::vqmovn(vec_sum_value1), wrapper::vqmovn(vec_sum_value2));
+            const auto temp16x8t_2 = vcombine_u16(wrapper::vqmovn(vec_sum_value3), wrapper::vqmovn(vec_sum_value4));
+            auto       res         = vcombine_u8(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+            wrapper::vstore(output.ptr(), res);
+        },
+        input, output);
     }
 };
 
@@ -90,7 +344,186 @@
     switch(axis)
     {
         case 0:
-            return Reducer<SumsqOpX>::reduceX(window, input, output, SumsqOpX());
+            switch(input->info()->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM_SQUARE>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM_SQUARE>());
+                case DataType::QASYMM8:
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 1:
+            switch(input->info()->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+                case DataType::QASYMM8:
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 2:
+            switch(input->info()->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+                case DataType::QASYMM8:
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 3:
+            switch(input->info()->data_type())
+            {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM_SQUARE>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM_SQUARE>());
+                case DataType::QASYMM8:
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported reduction axis");
+    }
+}
+
+void reduce_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+    switch(axis)
+    {
+        case 0:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpX_qasymm8<ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpX<float, 4, ReductionOperation::SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 1:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 2:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 3:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported reduction axis");
+    }
+}
+void reduce_mean_sum(const Window &window, const ITensor *input, ITensor *output, unsigned int axis)
+{
+    switch(axis)
+    {
+        case 0:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpX_qasymm8<ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpX<float, 4, ReductionOperation::MEAN_SUM>>::reduceX(window, input, output, RedOpX<float, 4, ReductionOperation::MEAN_SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 1:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceY(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 2:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceZ(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+        case 3:
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    return Reducer<RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW_qasymm8<ReductionOperation::MEAN_SUM>());
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    return Reducer<RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float16_t, 8, ReductionOperation::MEAN_SUM>());
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F32:
+                    return Reducer<RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>>::reduceW(window, input, output, RedOpYZW<float, 4, ReductionOperation::MEAN_SUM>());
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
@@ -109,16 +542,15 @@
     ARM_COMPUTE_UNUSED(op);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
 
         const TensorShape output_shape         = calculate_output_shape(input->tensor_shape(), axis);
         const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
@@ -170,10 +602,11 @@
 
     unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type());
 
-    _input       = input;
-    _output      = output;
-    _border_size = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
-    _op          = op;
+    _input          = input;
+    _output         = output;
+    _border_size    = (axis == 0) ? BorderSize(0, num_elems_processed_per_iteration - (input->info()->dimension(0) % num_elems_processed_per_iteration), 0, 0) : BorderSize();
+    _op             = op;
+    _reduction_axis = axis;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis);
@@ -202,7 +635,14 @@
         case ReductionOperation::SUM_SQUARE:
             reduce_sumsq(window, _input, _output, _reduction_axis);
             break;
+        case ReductionOperation::MEAN_SUM:
+            reduce_mean_sum(window, _input, _output, _reduction_axis);
+            break;
+        case ReductionOperation::SUM:
+            reduce_sum(window, _input, _output, _reduction_axis);
+            break;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction operation.");
     }
 }
+} // namespace arm_compute
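
The RedOpX / RedOpYZW functors above vectorise three reductions along a single axis. A scalar reference of the same arithmetic, written as a standalone sketch (container and function names are mine), makes the expected results explicit: SUM adds the elements, SUM_SQUARE adds their squares, MEAN_SUM divides the sum by the reduced dimension.

    #include <cstdio>
    #include <vector>

    // Scalar reference for the reductions implemented above along one axis.
    enum class Op { SUM, SUM_SQUARE, MEAN_SUM };

    float reduce_row(const std::vector<float> &row, Op op)
    {
        float acc = 0.f;
        for(float v : row)
        {
            acc += (op == Op::SUM_SQUARE) ? v * v : v;
        }
        if(op == Op::MEAN_SUM)
        {
            acc /= static_cast<float>(row.size());
        }
        return acc;
    }

    int main()
    {
        const std::vector<float> row = { 1.f, 2.f, 3.f, 4.f };
        std::printf("sum      = %f\n", reduce_row(row, Op::SUM));        // 10
        std::printf("sum_sq   = %f\n", reduce_row(row, Op::SUM_SQUARE)); // 30
        std::printf("mean_sum = %f\n", reduce_row(row, Op::MEAN_SUM));   // 2.5
    }
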
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 66115bb..edb3ffe 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -113,8 +113,8 @@
     AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
 
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapx_access(map_x->info(), 0, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapy_access(map_y->info(), 0, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
 
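The remap hunk above fixes the map tensors' horizontal access windows to take an (offset, elements-per-iteration) pair. Related to such fixed-width accesses, here is a small standalone sketch (the helper name and round-up convention are mine, not library code) of how much right padding a buffer needs when a kernel always reads a fixed number of elements per iteration starting from x = 0.

    #include <cstdio>

    // Sketch: when a kernel reads N elements per iteration, the last iteration
    // can run past the tensor width unless the buffer is padded on the right up
    // to the next multiple of N.
    int required_right_padding(int width, int elems_per_iteration)
    {
        const int rem = width % elems_per_iteration;
        return (rem == 0) ? 0 : elems_per_iteration - rem;
    }

    int main()
    {
        std::printf("%d\n", required_right_padding(20, 16)); // 12
        std::printf("%d\n", required_right_padding(32, 16)); // 0
    }
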
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
new file mode 100644
index 0000000..8baea2b
--- /dev/null
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <cstddef>
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+
+    // Validate output if initialized
+    if(output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+template <typename T>
+void NEReorgLayerKernel::run_reorg(const Window &window)
+{
+    const DataLayout data_layout = _input->info()->data_layout();
+    const size_t     idx_w       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t     idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t     idx_c       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const unsigned int stride = _stride;
+    const unsigned int out_c  = _output->info()->tensor_shape()[idx_c] / (stride * stride);
+    const uint8_t     *in_ptr = _input->buffer();
+
+    // Collapse
+    Window collapsed_window = window.collapse_if_possible(window, 4);
+
+    // Create Iterator
+    Iterator out(_output, collapsed_window);
+
+    // Perform reorg
+    execute_window_loop(collapsed_window, [&](const Coordinates & id)
+    {
+        // Get spatial coords and channels
+        const unsigned int w = id[idx_w];
+        const unsigned int h = id[idx_h];
+        const unsigned int c = id[idx_c];
+
+        // Calculate mapping
+        const unsigned int offset     = c / out_c;
+        Coordinates        map_coords = id;
+        map_coords.set(idx_w, w * stride + offset % stride);
+        map_coords.set(idx_h, h * stride + offset / stride);
+        map_coords.set(idx_c, c % out_c);
+
+        // Perform mapping
+        *(reinterpret_cast<T *>(out.ptr())) = *(reinterpret_cast<const T *>(in_ptr + _input->info()->offset_element_in_bytes(map_coords)));
+    },
+    out);
+}
+
+NEReorgLayerKernel::NEReorgLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _stride(1)
+{
+}
+
+void NEReorgLayerKernel::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto initialization if not yet initialized

+    const TensorShape output_shape = misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
+
+    _func   = nullptr;
+    _input  = input;
+    _output = output;
+    _stride = stride;
+
+    switch(input->info()->element_size())
+    {
+        case 1:
+            _func = &NEReorgLayerKernel::run_reorg<uint8_t>;
+            break;
+        case 2:
+            _func = &NEReorgLayerKernel::run_reorg<uint16_t>;
+            break;
+        case 4:
+            _func = &NEReorgLayerKernel::run_reorg<uint32_t>;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Element size not supported");
+            break;
+    }
+
+    // The NEReorgLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output->info(), Steps());
+
+    ICPPKernel::configure(win);
+}
+
+Status NEReorgLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
+    return Status{};
+}
+
+void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
+
+    if(_func != nullptr)
+    {
+        (this->*_func)(window);
+    }
+}
+} // namespace arm_compute
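
The reorg kernel above maps every output coordinate back to an input coordinate with integer arithmetic on the channel index. Below is a scalar, planar (CHW-style) sketch of that mapping; the array layout and the reference function are mine, and only the index arithmetic mirrors the kernel.

    #include <cstdio>
    #include <vector>

    // Scalar sketch of the reorg mapping (single batch, planar CHW layout).
    // For an output element at (w, h, c): offset = c / in_c selects one of the
    // stride*stride spatial positions, and the source channel is c % in_c.
    std::vector<float> reorg(const std::vector<float> &in, int in_w, int in_h, int in_c, int stride)
    {
        const int out_w  = in_w / stride;
        const int out_h  = in_h / stride;
        const int out_ch = in_c * stride * stride;
        std::vector<float> out(static_cast<size_t>(out_w) * out_h * out_ch);

        auto in_idx  = [&](int w, int h, int c) { return (c * in_h + h) * in_w + w; };
        auto out_idx = [&](int w, int h, int c) { return (c * out_h + h) * out_w + w; };

        for(int c = 0; c < out_ch; ++c)
        {
            const int offset = c / in_c; // which of the stride*stride positions
            const int src_c  = c % in_c; // source channel
            for(int h = 0; h < out_h; ++h)
            {
                for(int w = 0; w < out_w; ++w)
                {
                    const int src_w = w * stride + offset % stride;
                    const int src_h = h * stride + offset / stride;
                    out[out_idx(w, h, c)] = in[in_idx(src_w, src_h, src_c)];
                }
            }
        }
        return out;
    }

    int main()
    {
        // 4x4 single-channel input, stride 2 -> 2x2 output with 4 channels.
        std::vector<float> in(16);
        for(int i = 0; i < 16; ++i) in[i] = static_cast<float>(i);
        auto out = reorg(in, 4, 4, 1, 2);
        std::printf("out[0]=%g out[4]=%g\n", out[0], out[4]); // 0 and 1
    }
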
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 8043e8b..c718991 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -35,10 +35,23 @@
 
 #include <cstdint>
 
+/** [NEReshapeLayerKernel Kernel] **/
 using namespace arm_compute;
 
 namespace
 {
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+                                                         DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != output->tensor_shape().total_size());
+
+    return Status{};
+}
+
 template <typename T>
 inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *output)
 {
@@ -59,29 +72,28 @@
 
 void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
-                                                  DataType::U32, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input->info());
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->tensor_shape().x(), output->info()->tensor_shape().y());
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    // Set the output valid region
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
 }
 
+Status NEReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+
+    return Status{};
+}
+
 void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -109,3 +121,4 @@
             ARM_COMPUTE_ERROR("Unsupported data type!");
     }
 }
+/** [NEReshapeLayerKernel Kernel] **/
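
The reshape kernel above now validates that the two tensors hold the same number of elements and simply copies them in flat order. As a standalone sketch (MiniTensor and the shape representation are mine), a reshape reduces to a size check plus a flat copy with new shape metadata.

    #include <cassert>
    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Sketch: reshape keeps the flat element order and only changes the shape.
    struct MiniTensor
    {
        std::vector<int>   shape;
        std::vector<float> data;
    };

    MiniTensor reshape(const MiniTensor &src, std::vector<int> new_shape)
    {
        const auto total = [](const std::vector<int> &s)
        {
            return std::accumulate(s.begin(), s.end(), 1, std::multiplies<int>());
        };
        assert(total(src.shape) == total(new_shape) && "total sizes must match");
        return MiniTensor{ std::move(new_shape), src.data }; // flat copy, new shape
    }

    int main()
    {
        MiniTensor t{ { 2, 3 }, { 0, 1, 2, 3, 4, 5 } };
        MiniTensor r = reshape(t, { 3, 2 });
        std::printf("%zu elements, first=%g\n", r.data.size(), r.data[0]); // 6, 0
    }
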
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 3d19c1d..0f416de 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -282,6 +282,7 @@
 }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+// TODO (COMPMID-1535) : Revisit FP16 approximations
 float16x8_t vexp(const float16x8_t &vec)
 {
     float16x4x2_t res =
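
The TODO above marks the FP16 exp approximation for revision. As a standalone reference (not the library's vexp), one conservative way to evaluate exp on a float16x8_t is to widen each half to float32x4_t, apply scalar expf per lane, and narrow back; this is accurate but slow, and mainly useful for checking faster approximations.

    #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    #include <arm_neon.h>
    #include <cmath>

    // Reference sketch: widen to two float32x4_t halves, take exp per lane,
    // then narrow back to float16x8_t.
    inline float16x8_t vexp_reference(float16x8_t x)
    {
        float32x4_t lo = vcvt_f32_f16(vget_low_f16(x));
        float32x4_t hi = vcvt_f32_f16(vget_high_f16(x));

        float tmp[8];
        vst1q_f32(tmp, lo);
        vst1q_f32(tmp + 4, hi);
        for(float &v : tmp)
        {
            v = std::exp(v);
        }
        lo = vld1q_f32(tmp);
        hi = vld1q_f32(tmp + 4);
        return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));
    }
    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
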
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 7ac6cdb..870d2c9 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -101,13 +102,12 @@
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowStatic input_access(input, 0, 0, input->dimension(0), input->dimension(1));
-
-    bool window_changed = update_window_and_padding(win, input_access);
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    bool                  window_changed = update_window_and_padding(win, input_access);
 
     if(output->total_size() != 0)
     {
-        AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration_y, num_elems_processed_per_iteration_x);
 
         window_changed = window_changed || update_window_and_padding(win, output_access);
 
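The transpose hunk above swaps the per-iteration x/y element counts between the input and output access windows, because each rectangular input tile becomes a transposed tile in the output. A plain blocked transpose sketch (block sizes and the scalar inner copy are mine) shows the shape swap.

    #include <cstdio>
    #include <vector>

    // Blocked transpose sketch: each (bx x by) input tile becomes a (by x bx)
    // output tile, which is why the output access window swaps the per-iteration
    // x/y element counts relative to the input one.
    void transpose_blocked(const std::vector<float> &in, std::vector<float> &out,
                           int width, int height, int bx, int by)
    {
        for(int y0 = 0; y0 < height; y0 += by)
        {
            for(int x0 = 0; x0 < width; x0 += bx)
            {
                for(int y = y0; y < y0 + by && y < height; ++y)
                {
                    for(int x = x0; x < x0 + bx && x < width; ++x)
                    {
                        out[static_cast<size_t>(x) * height + y] = in[static_cast<size_t>(y) * width + x];
                    }
                }
            }
        }
    }

    int main()
    {
        const int w = 3, h = 2;
        std::vector<float> in = { 0, 1, 2, 3, 4, 5 }; // 2x3 row-major
        std::vector<float> out(in.size());
        transpose_blocked(in, out, w, h, 2, 2);
        std::printf("out[1]=%g (was in[3])\n", out[1]); // 3
    }
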
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
new file mode 100644
index 0000000..5dca58e
--- /dev/null
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
+std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+    const int              num_elems_processed_per_iteration_x_out = num_elems_processed_per_iteration_x * info.x();
+    Window                 win                                     = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x_out));
+    AccessWindowRectangle  input_access(input, 0, 0, num_elems_processed_per_iteration_x, 1, 0.5f, 0.5f);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x_out);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, output->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    Window                 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration_x);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, output->valid_region());
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
+{
+    std::pair<Status, Window> win_config;
+    switch(input->data_layout())
+    {
+        case DataLayout::NCHW:
+            win_config = validate_and_configure_window_nchw(input, output, num_elems_processed_per_iteration_x, info);
+            break;
+        case DataLayout::NHWC:
+            win_config = validate_and_configure_window_nhwc(input, output, num_elems_processed_per_iteration_x, info);
+            break;
+        default:
+            win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
+    }
+
+    return win_config;
+}
+} // namespace
+NEUpsampleLayerKernel::NEUpsampleLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_x()
+{
+}
+
+Status NEUpsampleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(policy);
+
+    const DataLayout data_layout = input->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
+
+    // Check output if configured
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+    }
+
+    const int num_elems_processed_per_iteration_x = 16 / input->element_size();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              output->clone().get(), num_elems_processed_per_iteration_x, info)
+                                .first);
+    return Status{};
+}
+
+void NEUpsampleLayerKernel::upsample_f32_nchw(const arm_compute::Window &window)
+{
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+    Iterator  input(_input, window_in);
+    Iterator  output(_output, window_out);
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const float32x4_t data      = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
+        const float32x4_t data_out1 = { vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 0), vgetq_lane_f32(data, 1), vgetq_lane_f32(data, 1) };
+        const float32x4_t data_out2 = { vgetq_lane_f32(data, 2), vgetq_lane_f32(data, 2), vgetq_lane_f32(data, 3), vgetq_lane_f32(data, 3) };
+        auto              out       = reinterpret_cast<float *>(output.ptr());
+
+        vst1q_f32(out, data_out1);
+        vst1q_f32(out + 4, data_out2);
+        vst1q_f32(out + offset_y_out, data_out1);
+        vst1q_f32(out + offset_y_out + 4, data_out2);
+    },
+    input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_f32_nhwc(const arm_compute::Window &window)
+{
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+    window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+    Iterator input(_input, window);
+    Iterator output(_output, window_out);
+
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float);
+    const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(input.ptr()));
+        auto              out  = reinterpret_cast<float *>(output.ptr());
+
+        vst1q_f32(out, data);
+        vst1q_f32(out + offset_y_out, data);
+        vst1q_f32(out + offset_z_out, data);
+        vst1q_f32(out + offset_y_out + offset_z_out, data);
+    },
+    input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_qasymm8_nchw(const arm_compute::Window &window)
+{
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+    Iterator  input(_input, window_in);
+    Iterator  output(_output, window_out);
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const uint8x16_t data      = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
+        const uint8x16_t data_out1 = { vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 0), vgetq_lane_u8(data, 1), vgetq_lane_u8(data, 1),
+                                       vgetq_lane_u8(data, 2), vgetq_lane_u8(data, 2), vgetq_lane_u8(data, 3), vgetq_lane_u8(data, 3),
+                                       vgetq_lane_u8(data, 4), vgetq_lane_u8(data, 4), vgetq_lane_u8(data, 5), vgetq_lane_u8(data, 5),
+                                       vgetq_lane_u8(data, 6), vgetq_lane_u8(data, 6), vgetq_lane_u8(data, 7), vgetq_lane_u8(data, 7)
+                                     };
+        const uint8x16_t data_out2 =
+        {
+            vgetq_lane_u8(data, 8), vgetq_lane_u8(data, 8), vgetq_lane_u8(data, 9), vgetq_lane_u8(data, 9),
+            vgetq_lane_u8(data, 10), vgetq_lane_u8(data, 10), vgetq_lane_u8(data, 11), vgetq_lane_u8(data, 11),
+            vgetq_lane_u8(data, 12), vgetq_lane_u8(data, 12), vgetq_lane_u8(data, 13), vgetq_lane_u8(data, 13),
+            vgetq_lane_u8(data, 14), vgetq_lane_u8(data, 14), vgetq_lane_u8(data, 15), vgetq_lane_u8(data, 15)
+        };
+        auto out = reinterpret_cast<uint8_t *>(output.ptr());
+
+        vst1q_u8(out, data_out1);
+        vst1q_u8(out + 16, data_out2);
+        vst1q_u8(out + offset_y_out, data_out1);
+        vst1q_u8(out + offset_y_out + 16, data_out2);
+    },
+    input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_qasymm8_nhwc(const arm_compute::Window &window)
+{
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+    window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+    Iterator input(_input, window);
+    Iterator output(_output, window_out);
+
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(uint8_t);
+    const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(uint8_t);
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const uint8x16_t data = vld1q_u8(reinterpret_cast<const uint8_t *>(input.ptr()));
+        auto             out  = reinterpret_cast<uint8_t *>(output.ptr());
+
+        vst1q_u8(out, data);
+        vst1q_u8(out + offset_y_out, data);
+        vst1q_u8(out + offset_z_out, data);
+        vst1q_u8(out + offset_y_out + offset_z_out, data);
+    },
+    input, output);
+}
+
+void NEUpsampleLayerKernel::upsample_f16_nchw(const arm_compute::Window &window)
+{
+    ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    Window window_in(window);
+    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
+
+    Iterator  input(_input, window_in);
+    Iterator  output(_output, window_out);
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const float16x8_t data      = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
+        const float16x8_t data_out1 = { vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 0), vgetq_lane_f16(data, 1), vgetq_lane_f16(data, 1),
+                                        vgetq_lane_f16(data, 2), vgetq_lane_f16(data, 2), vgetq_lane_f16(data, 3), vgetq_lane_f16(data, 3)
+                                      };
+        const float16x8_t data_out2 = { vgetq_lane_f16(data, 4), vgetq_lane_f16(data, 4), vgetq_lane_f16(data, 5), vgetq_lane_f16(data, 5),
+                                        vgetq_lane_f16(data, 6), vgetq_lane_f16(data, 6), vgetq_lane_f16(data, 7), vgetq_lane_f16(data, 7)
+                                      };
+        auto out = reinterpret_cast<float16_t *>(output.ptr());
+
+        vst1q_f16(out, data_out1);
+        vst1q_f16(out + 8, data_out2);
+        vst1q_f16(out + offset_y_out, data_out1);
+        vst1q_f16(out + offset_y_out + 8, data_out2);
+    },
+    input, output);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+}
+
+void NEUpsampleLayerKernel::upsample_f16_nhwc(const arm_compute::Window &window)
+{
+    ARM_COMPUTE_UNUSED(window);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    Window window_out(window);
+    window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
+    window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
+
+    Iterator  input(_input, window);
+    Iterator  output(_output, window_out);
+    const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(float16_t);
+    const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(float16_t);
+
+    execute_window_loop(window_out, [&](const Coordinates & id)
+    {
+        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()));
+        auto              out  = reinterpret_cast<float16_t *>(output.ptr());
+
+        vst1q_f16(out, data);
+        vst1q_f16(out + offset_y_out, data);
+        vst1q_f16(out + offset_z_out, data);
+        vst1q_f16(out + offset_y_out + offset_z_out, data);
+    },
+    input, output);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+}
+
+void NEUpsampleLayerKernel::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(policy);
+
+    _input  = input;
+    _output = output;
+    _info   = info;
+
+    const DataLayout data_layout = input->info()->data_layout();
+
+    TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    output->info()->set_data_layout(data_layout);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(NEUpsampleLayerKernel::validate(input->info(), output->info(), info, policy));
+
+    _num_elems_processed_per_iteration_x = 16 / output->info()->element_size();
+
+    switch(data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    _func = &NEUpsampleLayerKernel::upsample_qasymm8_nchw;
+                    break;
+                case DataType::F32:
+                    _func = &NEUpsampleLayerKernel::upsample_f32_nchw;
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    _func = &NEUpsampleLayerKernel::upsample_f16_nchw;
+                    break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            switch(input->info()->data_type())
+            {
+                case DataType::QASYMM8:
+                    _func = &NEUpsampleLayerKernel::upsample_qasymm8_nhwc;
+                    break;
+                case DataType::F32:
+                    _func = &NEUpsampleLayerKernel::upsample_f32_nhwc;
+                    break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                case DataType::F16:
+                    _func = &NEUpsampleLayerKernel::upsample_f16_nhwc;
+                    break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+                default:
+                    ARM_COMPUTE_ERROR("Not implemented");
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Configure window
+    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
+                                                                         output->info(),
+                                                                         _num_elems_processed_per_iteration_x,
+                                                                         info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+void NEUpsampleLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (this->*_func)(window);
+}
+} // namespace arm_compute
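
The NCHW paths of the upsample kernel above duplicate each loaded element into a 2x2 output block using lane shuffles. A scalar single-channel reference of the same 2x nearest-neighbour upsampling follows; the function and layout here are mine.

    #include <cstdio>
    #include <vector>

    // Scalar reference for 2x nearest-neighbour upsampling: every input element
    // at (x, y) fills the 2x2 output block starting at (2x, 2y).
    std::vector<float> upsample2x(const std::vector<float> &in, int width, int height)
    {
        const int out_w = width * 2;
        std::vector<float> out(static_cast<size_t>(out_w) * height * 2);
        for(int y = 0; y < height; ++y)
        {
            for(int x = 0; x < width; ++x)
            {
                const float v = in[static_cast<size_t>(y) * width + x];
                for(int dy = 0; dy < 2; ++dy)
                {
                    for(int dx = 0; dx < 2; ++dx)
                    {
                        out[static_cast<size_t>(2 * y + dy) * out_w + (2 * x + dx)] = v;
                    }
                }
            }
        }
        return out;
    }

    int main()
    {
        std::vector<float> in = { 1, 2, 3, 4 }; // 2x2
        auto out = upsample2x(in, 2, 2);        // 4x4
        std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 1 2 2
    }
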
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 2c9ad92..259f4fc 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -34,16 +34,12 @@
 
 namespace
 {
-template <typename T, bool is_nhwc>
+template <typename T>
 void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
 {
-    DataLayout         data_layout     = input->info()->data_layout();
-    const int          idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int          idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int          idx_channel     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const unsigned int kernel_size_x   = input->info()->dimension(idx_width);
-    const unsigned int kernel_size_y   = input->info()->dimension(idx_height);
-    const unsigned int kernel_depth    = input->info()->dimension(idx_channel);
+    const unsigned int kernel_size_x   = input->info()->dimension(0);
+    const unsigned int kernel_size_y   = input->info()->dimension(1);
+    const unsigned int kernel_depth    = input->info()->dimension(2);
     const unsigned int input_stride_x  = input->info()->strides_in_bytes().x();
     const unsigned int input_stride_y  = input->info()->strides_in_bytes().y();
     const unsigned int input_stride_z  = input->info()->strides_in_bytes().z();
@@ -71,13 +67,13 @@
                 for(unsigned int i = 0; i < kernel_size_x; ++i)
                 {
                     *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
-                    tmp_input_ptr += is_nhwc ? input_stride_y : input_stride_x;
+                    tmp_input_ptr += input_stride_x;
                     tmp_output_ptr += output_stride_y;
                 }
-                curr_input_row_ptr += is_nhwc ? input_stride_z : input_stride_y;
+                curr_input_row_ptr += input_stride_y;
                 tmp_input_ptr = curr_input_row_ptr;
             }
-            curr_input_depth_ptr += is_nhwc ? input_stride_x : input_stride_z;
+            curr_input_depth_ptr += input_stride_z;
             curr_input_row_ptr = curr_input_depth_ptr;
             tmp_input_ptr      = curr_input_depth_ptr;
         }
@@ -164,24 +160,21 @@
     _bias   = bias;
     _output = output;
 
-    const DataLayout data_layout = input->info()->data_layout();
-    const bool       is_nhwc     = data_layout == DataLayout::NHWC;
-
     switch(_input->info()->element_size())
     {
         case 4:
         {
-            _func = is_nhwc ? &weights_reshape<uint32_t, true> : &weights_reshape<uint32_t, false>;
+            _func = &weights_reshape<uint32_t>;
             break;
         }
         case 2:
         {
-            _func = is_nhwc ? &weights_reshape<uint16_t, true> : &weights_reshape<uint16_t, false>;
+            _func = &weights_reshape<uint16_t>;
             break;
         }
         case 1:
         {
-            _func = is_nhwc ? &weights_reshape<uint8_t, true> : &weights_reshape<uint8_t, false>;
+            _func = &weights_reshape<uint8_t>;
             break;
         }
         default:
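
The simplified weights_reshape() above linearises each filter into one column of the reshaped matrix, with an optional bias element appended. A stride-free sketch of that filters-as-columns layout, assuming contiguous buffers rather than the kernel's raw-stride walk over windows:

#include <cstddef>
#include <vector>

// Filters-as-columns reshape: element e of filter f lands at row e, column f;
// an optional bias value is appended as the last row of the column.
std::vector<float> reshape_weights(const std::vector<float> &weights, const std::vector<float> *bias,
                                   std::size_t kx, std::size_t ky, std::size_t depth, std::size_t num_filters)
{
    const std::size_t  elems_per_filter = kx * ky * depth;
    const std::size_t  rows             = elems_per_filter + (bias != nullptr ? 1 : 0);
    std::vector<float> out(rows * num_filters, 0.f);

    for(std::size_t f = 0; f < num_filters; ++f)
    {
        for(std::size_t e = 0; e < elems_per_filter; ++e)
        {
            out[e * num_filters + f] = weights[f * elems_per_filter + e];
        }
        if(bias != nullptr)
        {
            out[elems_per_filter * num_filters + f] = (*bias)[f];
        }
    }
    return out;
}
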
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 1b38677..a84a6d9 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -67,7 +67,6 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
     }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 3);
 
     return Status{};
 }
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index 3d7a16d..3e76a08 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -40,19 +40,27 @@
 
 namespace
 {
+inline bool is_kernel_size_supported(Size2D size)
+{
+    const std::array<Size2D, 8> supported_input_sizes = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } };
+    return std::end(supported_input_sizes) != std::find(std::begin(supported_input_sizes), std::end(supported_input_sizes), size);
+}
+
 Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
 
-    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != 3 && input->dimension(idx_width) != 5);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != input->dimension(idx_height));
+    const size_t idx_width    = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const auto   input_width  = input->dimension(idx_width);
+    const auto   input_height = input->dimension(idx_height);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(input_width, input_height)), "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
     const Size2D &output_tile = winograd_info.output_tile_size;
-    ARM_COMPUTE_RETURN_ERROR_ON(output_tile != Size2D(2U, 2U) && output_tile != Size2D(4U, 4U));
+    const std::array<Size2D, 8> supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } };
+    ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile));
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -98,8 +106,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd input transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(kernel_dims.width, kernel_dims.height)),
+                                    "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
 
     // Validate configured output
     if(output->total_size() != 0)
@@ -151,9 +159,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area());
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != 3U && kernel_dims.width != 5U), "Winograd output transform only supports 3x3 and 5x5 kernels");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((kernel_dims.width != kernel_dims.height), "Winograd output transform only supports 3x3 and 5x5 kernels");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((input->dimension(2) != size_t(16U)) && (input->dimension(2) != size_t(36U))), "Only 2x2 and 4x4 output tile is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(kernel_dims.width, kernel_dims.height)),
+                                    "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
+
+    const std::array<unsigned int, 3> supported_gemm_sizes = { { 8U, 16U, 36U } };
+    ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2)));
     ARM_COMPUTE_UNUSED(kernel_dims);
     if(bias != nullptr)
     {
@@ -201,7 +211,21 @@
 }
 } // namespace
 
-// Weights transform
+template <typename T>
+Status INEWinogradLayerTransformWeightsKernel<T>::validate(const ITensorInfo *input, const ITensorInfo *weights)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    const DataLayout   data_layout = input->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(Size2D(weights->dimension(width_idx), weights->dimension(height_idx))),
+                                    "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    return Status{};
+}
+
+template class INEWinogradLayerTransformWeightsKernel<float>;
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 unsigned int NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int num_output_channels, int num_input_channels) const
@@ -225,6 +249,7 @@
     return WinogradConv::get_kernel_matrix_stride(kernel_shape);
 }
 
+#ifndef DOXYGEN_SKIP_THIS
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure(
     const ITensor *weights_hwio,
@@ -246,6 +271,7 @@
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
     INEKernel::configure(win);
 }
+#endif /* DOXYGEN_SKIP_THIS */
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
 void NEWinogradLayerTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run(const Window &window, const ThreadInfo &info)
@@ -278,7 +304,13 @@
 template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>;
 template class NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>;
 template class NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>;
 
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>;
 // Input transform
 
 template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
@@ -343,14 +375,15 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 
-    const int element_size_in_bytes = _input_nhwc->info()->element_size();
-    const int input_col_stride      = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
-    const int input_row_stride      = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
-    const int input_batch_stride    = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
-
-    InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes()),
+    const int      element_size_in_bytes = _input_nhwc->info()->element_size();
+    const int      input_col_stride      = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+    const int      input_row_stride      = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+    const int      input_batch_stride    = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+    const auto     input_nhwc_ptr        = reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes());
+    auto           output_ptr            = reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes());
+    InputTransform input_transform(input_nhwc_ptr,
                                    _num_batches, _num_rows, _num_cols, _num_channels, _padding,
-                                   reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes()),
+                                   output_ptr,
                                    _matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
 
     // The code below cannot be moved to configure because biases haven't been allocated at that point
@@ -371,6 +404,13 @@
 template class NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>;
 template class NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>;
 template class NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>;
+
+template class NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>;
 
 // Output transform
 
@@ -438,7 +478,6 @@
     Window win;
     auto   win_last = output_transform.get_window();
     win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-
     _output_nhwc->info()->set_valid_region(ValidRegion(Coordinates(), _output_nhwc->info()->tensor_shape()));
 
     INEKernel::configure(win);
@@ -452,10 +491,14 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output_workspace);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
 
+    const int out_batch_stride = 0;
+    const int out_row_stride   = _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T);
+    const int out_col_stride   = _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T);
+
     OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
                                      (_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
                                      reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
-                                     _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T));
+                                     _num_batches, _num_rows, _num_cols, _num_channels, out_batch_stride, out_row_stride, out_col_stride);
 
     // The code below cannot be moved to configure because biases haven't been allocated at that point
     const size_t fst = window.x().start();
@@ -478,5 +521,12 @@
 template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>;
 template class NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>;
 template class NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>;
+template class NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>;
+template class NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>;
+
+template class NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>;
+template class NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>;
+template class NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>;
+template class NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>;
 
 } // namespace arm_compute
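
The supported GEMM depths {8, 16, 36} checked in the output-transform validation above are the areas of the transformed input tile, where each dimension of the input tile is output_tile + kernel - 1. A small sketch of that arithmetic, for illustration only:

#include <cstdio>

// Transformed input tile area: (output_tile + kernel - 1) in each dimension.
static unsigned int winograd_gemm_depth(unsigned int out_w, unsigned int out_h, unsigned int k_w, unsigned int k_h)
{
    return (out_w + k_w - 1) * (out_h + k_h - 1);
}

int main()
{
    std::printf("%u %u %u %u\n",
                winograd_gemm_depth(2, 2, 3, 3),  // 16: F(2x2, 3x3)
                winograd_gemm_depth(4, 4, 3, 3),  // 36: F(4x4, 3x3)
                winograd_gemm_depth(2, 2, 5, 5),  // 36: F(2x2, 5x5)
                winograd_gemm_depth(1, 6, 1, 3)); // 8:  F(1x6, 1x3)
    return 0;
}
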
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
new file mode 100644
index 0000000..009562b
--- /dev/null
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "arm_compute/core/QAsymm8.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_classes <= 0);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % (num_classes + 5)) != 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    if(output != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, *input);
+    }
+
+    const bool         is_nchw                           = input->data_layout() == DataLayout::NCHW;
+    const unsigned int num_elems_processed_per_iteration = is_nchw ? 16 / input->element_size() : 1;
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    if(output != nullptr)
+    {
+        AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+    else
+    {
+        window_changed = update_window_and_padding(win, AccessWindowHorizontal(input, 0, num_elems_processed_per_iteration));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEYOLOLayerKernel::NEYOLOLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _act_info(), _num_classes()
+{
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp32_nchw(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float32x4_t res = vld1q_f32(reinterpret_cast<float *>(input.ptr()));
+
+        const int  box_ch_id = id.z() % (_num_classes + 5);
+        const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+        // Perform activation
+        if(activate)
+        {
+            auto activation = ::detail::logistic<float, 4>(_act_info);
+            activation(res);
+        }
+
+        // Store results
+        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
+    },
+    input, output);
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp32_nhwc(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float res = *(reinterpret_cast<float *>(input.ptr()));
+
+        const int  box_ch_id = id.x() % (_num_classes + 5);
+        const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+        // Perform activation
+        if(activate)
+        {
+            res = 1.f / (1.f + std::exp(-res));
+        }
+
+        // Store result
+        *(reinterpret_cast<float *>(output.ptr())) = res;
+    },
+    input, output);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+void NEYOLOLayerKernel::yolo_layer_fp16_nchw(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float16x8_t res = vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()));
+
+        const int  box_ch_id = id.z() % (_num_classes + 5);
+        const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+        // Perform activation
+        if(activate)
+        {
+            auto activation = ::detail::logistic<float16_t, 8>(_act_info);
+            activation(res);
+        }
+
+        // Store results
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+    },
+    input, output);
+}
+
+void NEYOLOLayerKernel::yolo_layer_fp16_nhwc(const Window &window)
+{
+    Iterator input(_input, window);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        float16_t res = *(reinterpret_cast<float16_t *>(input.ptr()));
+
+        const int  box_ch_id = id.x() % (_num_classes + 5);
+        const bool activate  = box_ch_id != 2 && box_ch_id != 3;
+
+        // Perform activation
+        if(activate)
+        {
+            res = 1.f / (1.f + std::exp(-res));
+        }
+
+        // Store result
+        *(reinterpret_cast<float16_t *>(output.ptr())) = res;
+    },
+    input, output);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+void NEYOLOLayerKernel::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info, num_classes));
+
+    _input       = input;
+    _output      = output;
+    _act_info    = act_info;
+    _num_classes = num_classes;
+
+    switch(_input->info()->data_type())
+    {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? &NEYOLOLayerKernel::yolo_layer_fp16_nhwc : &NEYOLOLayerKernel::yolo_layer_fp16_nchw;
+            break;
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F32:
+            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? &NEYOLOLayerKernel::yolo_layer_fp32_nhwc : &NEYOLOLayerKernel::yolo_layer_fp32_nchw;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
+}
+
+Status NEYOLOLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, act_info, num_classes));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output == nullptr) ? nullptr : output->clone().get()).first);
+
+    return Status{};
+}
+
+void NEYOLOLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    (this->*_func)(window);
+}
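
The YOLO kernel above applies the logistic function to every element of each (num_classes + 5)-channel box prediction except channels 2 and 3, which hold the raw box width and height. A scalar reference of that rule, as an illustration of the logic rather than the vectorised NEON path:

#include <cmath>
#include <cstddef>
#include <vector>

// Apply the logistic function to every channel except 2 (width) and 3 (height)
// of each (num_classes + 5)-wide box prediction.
void yolo_activate(std::vector<float> &channels, int num_classes)
{
    const int box_len = num_classes + 5; // x, y, w, h, objectness, then class scores
    for(std::size_t i = 0; i < channels.size(); ++i)
    {
        const int box_ch_id = static_cast<int>(i) % box_len;
        if(box_ch_id != 2 && box_ch_id != 3)
        {
            channels[i] = 1.f / (1.f + std::exp(-channels[i]));
        }
    }
}
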
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 4579ebd..9194bdd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -34,10 +34,22 @@
 #include "kernels/a64_hgemm_24x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a32_sgemm_8x6.hpp"
+#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
 
 namespace arm_gemm {
 
-#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
+public:
+
+    UniqueGemmCommon<__fp16, __fp16> instantiate(const GemmArgs<__fp16> &args) override {
+        return UniqueGemmCommon<__fp16, __fp16>(new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args));
+    }
+
+    GemmImpl_gemm_fp16_interleaved_fp16() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED_FP16) { }
+};
+
+#elif defined(__aarch64__)
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS)
 class GemmImpl_gemm_fp16_interleaved_fp16 : public GemmImplementation<__fp16, __fp16> {
@@ -73,13 +85,13 @@
     GemmImpl_gemm_fp16_interleaved() : GemmImplementation<__fp16, __fp16>(GemmMethod::GEMM_INTERLEAVED) { }
 };
 
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
 static GemmImpl_gemm_fp16_interleaved_fp16 gemm_fp16_interleaved_fp16_impl{};
 #endif
 static GemmImpl_gemm_fp16_interleaved gemm_fp16_interleaved_impl{};
 
 static std::vector<GemmImplementation<__fp16, __fp16> *> gemm_fp16_methods = {
-#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
+#if defined(__aarch64__) && (defined(__ARM_FEATURE_VECTOR_ARITHMETIC) || defined(FP16_KERNELS) || defined(__ARM_FEATURE_SVE))
     &gemm_fp16_interleaved_fp16_impl,
 #endif
     &gemm_fp16_interleaved_impl
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index e840e90..7d14971 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -36,10 +36,12 @@
 #include "kernels/a64_sgemv_pretransposed.hpp"
 #include "kernels/a64_sgemm_native_16x4.hpp"
 
+#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+
 namespace arm_gemm {
 
-#ifdef __aarch64__
-// SGEMM implementations for AArch64
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// SGEMM implementations for AArch64 without SVE
 
 // Pretransposed GEMV
 class GemmImpl_sgemm_gemv_pretransposed : public GemmImplementation<float, float> {
@@ -92,7 +94,9 @@
 class GemmImpl_sgemm_gemm_interleaved : public GemmImplementation<float, float> {
 public:
     UniqueGemmCommon<float, float> instantiate(const GemmArgs<float> &args) override {
-#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+        return UniqueGemmCommon<float, float> (new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args));
+#elif defined(__aarch64__)
         return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_12x8, float, float>(args));
 #elif defined(__arm__)
         return UniqueGemmCommon<float, float> (new GemmInterleaved<sgemm_8x6, float, float>(args));
@@ -105,7 +109,7 @@
 };
 
 static GemmImpl_gemv_batched<float, float> gemv_batched_impl{};
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
 static GemmImpl_sgemm_gemv_pretransposed sgemm_gemv_pretransposed_impl{};
 static GemmImpl_sgemm_gemv_native_transposed sgemm_gemv_native_transposed_impl{};
 static GemmImpl_sgemm_gemm_native sgemm_gemm_native_impl{};
@@ -115,7 +119,7 @@
 /* List of implementations (order matters) */
 static std::vector<GemmImplementation<float, float> *> SGemmMethods = {
     &gemv_batched_impl,
-#ifdef __aarch64__
+#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
     &sgemm_gemv_pretransposed_impl,
     &sgemm_gemv_native_transposed_impl,
     &sgemm_gemm_native_impl,
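
The SGemmMethods vector above is annotated "order matters": callers walk it front to back and take the first implementation whose is_supported() accepts the arguments. A stripped-down sketch of that first-supported-wins pattern, with hypothetical names rather than the library's actual types:

#include <vector>

struct Impl
{
    virtual ~Impl() = default;
    virtual bool is_supported() const { return true; } // most entries are unconditional
};

// Walk the list front to back; earlier entries win, hence "order matters".
const Impl *pick_first_supported(const std::vector<const Impl *> &methods)
{
    for(const Impl *m : methods)
    {
        if(m != nullptr && m->is_supported())
        {
            return m;
        }
    }
    return nullptr;
}
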
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index b7e8fa2..ad171a7 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -59,4 +59,4 @@
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __aarch64__
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index dffa056..627d8ab 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -31,9 +31,21 @@
 #include "kernels/a64_gemm_s16_12x8.hpp"
 #include "kernels/a64_gemm_s8_12x8.hpp"
 #include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
 
 namespace arm_gemm {
 
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
+public:
+    UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
+        return UniqueGemmCommon<int8_t, int32_t>(new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args));
+    }
+
+    GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
+};
+#else
+
 class GemmImpl_gemm_s8_interleaved_dot : public GemmImplementation<int8_t, int32_t> {
 public:
     bool is_supported(const GemmArgs<int32_t> &args) override {
@@ -47,6 +59,8 @@
     GemmImpl_gemm_s8_interleaved_dot() : GemmImplementation<int8_t, int32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
 };
 
+#endif
+
 class GemmImpl_gemm_s8_interleaved : public GemmImplementation<int8_t, int32_t> {
 public:
     UniqueGemmCommon<int8_t, int32_t> instantiate(const GemmArgs<int32_t> &args) override {
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index bfa4908..0e58a4d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -450,6 +450,7 @@
         return _pretransposed && (_B_transposed==nullptr);
     }
 
+    // TODO: this could almost certainly be considerably simpler.
     size_t get_B_pretransposed_array_size() const override {
         size_t total=0;
         blockwalker current(*this);
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index 6bc7df0..baa1316 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -76,7 +76,7 @@
 
     GemmNative(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmultis, const Tr beta) :
         _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmultis(nmultis), _beta(beta), _ci(ci) {
-        /* For now don't do any blocking.*/
+        /* For now don't do any blocking. TODO: figure out if we should. */
         k_block = K;
         n_block = N;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 60b7954..b7c1bab 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -31,9 +31,20 @@
 #include "kernels/a64_gemm_u16_12x8.hpp"
 #include "kernels/a64_gemm_u8_12x8.hpp"
 #include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
 
 namespace arm_gemm {
 
+#ifdef __ARM_FEATURE_SVE
+class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
+public:
+    UniqueGemmCommon<uint8_t, uint32_t> instantiate(const GemmArgs<uint32_t> &args) override {
+        return UniqueGemmCommon<uint8_t, uint32_t>(new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args));
+    }
+
+    GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
+};
+#else
 class GemmImpl_gemm_u8_interleaved_dot : public GemmImplementation<uint8_t, uint32_t> {
 public:
     bool is_supported(const GemmArgs<uint32_t> &args) override {
@@ -46,6 +57,7 @@
 
     GemmImpl_gemm_u8_interleaved_dot() : GemmImplementation<uint8_t, uint32_t>(GemmMethod::GEMM_INTERLEAVED_DOT) { }
 };
+#endif
 
 class GemmImpl_gemm_u8_interleaved : public GemmImplementation<uint8_t, uint32_t> {
 public:
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index e37d4c5..241c5fe 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -65,7 +65,7 @@
     GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
 
     GemvNativeTransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const Tr beta) : _Nsize(N), _Ksize(K), _nmultis(nmultis), _beta(beta), _ci(ci) {
-        /* For now don't do any blocking.*/
+        /* For now don't do any blocking. TODO: figure out if we should. */
         m_block = K;
         n_block = N;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index d745883..e53ddb2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -71,7 +71,7 @@
     GemvPretransposed(const CPUInfo *ci, const unsigned int N, const unsigned int K, const unsigned int nmultis, const bool trB, const Tr beta) :
         _Nsize(N), _Ksize(K), _nmultis(nmultis), _trB(trB), _beta(beta), _ci(ci),
         _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave) * strategy::A_interleave) {
-        /* For now don't do any blocking.*/
+        /* For now don't do any blocking. TODO: figure out if we should. */
         m_block = K;
         n_block = N;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
new file mode 100644
index 0000000..3fd738e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+class interleaved_fp16_mla_3VLx8 {
+public:
+    typedef __fp16 operand_type;
+    typedef __fp16 result_type;
+
+    typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+
+    /* Kernel blocking parameters */
+    static int out_width()
+    {
+        return svcnth() * 3;
+    }
+
+    static int out_height()
+    {
+        return 8;
+    }
+
+    static int k_unroll()
+    {
+        return 1;
+    }
+
+    // Use the standard fixed size transforms.
+    StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+
+    kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
+
+    interleaved_fp16_mla_3VLx8(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
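
The "3VLx8" in the kernel name describes its output tile: three SVE vector lengths of fp16 across and eight rows down, held in the 24 accumulator registers z8-z31. A short sketch of that arithmetic for an assumed 256-bit vector length (the real code queries svcnth() at run time):

#include <cstdio>

int main()
{
    const unsigned int vl_bits      = 256;            // assumed SVE vector length
    const unsigned int fp16_lanes   = vl_bits / 16;   // what svcnth() would return here: 16
    const unsigned int out_width    = 3 * fp16_lanes; // out_width(): 48 columns
    const unsigned int out_height   = 8;              // out_height(): 8 rows
    const unsigned int accumulators = 3 * out_height; // z8..z31: 24 accumulator registers
    std::printf("tile %ux%u held in %u Z registers\n", out_width, out_height, accumulators);
    return 0;
}
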
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
new file mode 100644
index 0000000..92ec888
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+    const __fp16 *a_ptr = Apanel;
+    __fp16 *c_ptr = Cpanel;
+
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "mov z8.h, #0\n"
+                "ptrue p0.h\n"
+                "mov z9.h, #0\n"
+                "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+                "mov z10.h, #0\n"
+                "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+                "mov z11.h, #0\n"
+                "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "mov z12.h, #0\n"
+                "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "mov z13.h, #0\n"
+                "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
+                "mov z14.h, #0\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
+                "mov z15.h, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "mov z16.h, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "mov z17.h, #0\n"
+                "mov z18.h, #0\n"
+                "mov z19.h, #0\n"
+                "mov z20.h, #0\n"
+                "mov z21.h, #0\n"
+                "mov z22.h, #0\n"
+                "mov z23.h, #0\n"
+                "mov z24.h, #0\n"
+                "mov z25.h, #0\n"
+                "mov z26.h, #0\n"
+                "mov z27.h, #0\n"
+                "mov z28.h, #0\n"
+                "mov z29.h, #0\n"
+                "mov z30.h, #0\n"
+                "mov z31.h, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                "fmla z8.h, z2.h, z0.h[0]\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.h, z2.h, z0.h[1]\n"
+                "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.h, z2.h, z0.h[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
+                "fmla z11.h, z2.h, z0.h[3]\n"
+                "fmla z12.h, z2.h, z0.h[4]\n"
+                "fmla z13.h, z2.h, z0.h[5]\n"
+                "fmla z14.h, z2.h, z0.h[6]\n"
+                "fmla z15.h, z2.h, z0.h[7]\n"
+                "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+                "fmla z16.h, z3.h, z0.h[0]\n"
+                "fmla z17.h, z3.h, z0.h[1]\n"
+                "fmla z18.h, z3.h, z0.h[2]\n"
+                "fmla z19.h, z3.h, z0.h[3]\n"
+                "fmla z20.h, z3.h, z0.h[4]\n"
+                "fmla z21.h, z3.h, z0.h[5]\n"
+                "fmla z22.h, z3.h, z0.h[6]\n"
+                "fmla z23.h, z3.h, z0.h[7]\n"
+                "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "fmla z24.h, z4.h, z0.h[0]\n"
+                "fmla z25.h, z4.h, z0.h[1]\n"
+                "fmla z26.h, z4.h, z0.h[2]\n"
+                "fmla z27.h, z4.h, z0.h[3]\n"
+                "fmla z28.h, z4.h, z0.h[4]\n"
+                "fmla z29.h, z4.h, z0.h[5]\n"
+                "fmla z30.h, z4.h, z0.h[6]\n"
+                "fmla z31.h, z4.h, z0.h[7]\n"
+                "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "fmla z8.h, z5.h, z1.h[0]\n"
+                "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+                "fmla z9.h, z5.h, z1.h[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "fmla z10.h, z5.h, z1.h[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "fmla z11.h, z5.h, z1.h[3]\n"
+                "fmla z12.h, z5.h, z1.h[4]\n"
+                "fmla z13.h, z5.h, z1.h[5]\n"
+                "fmla z14.h, z5.h, z1.h[6]\n"
+                "fmla z15.h, z5.h, z1.h[7]\n"
+                "ld1h z5.h, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "fmla z16.h, z6.h, z1.h[0]\n"
+                "fmla z17.h, z6.h, z1.h[1]\n"
+                "fmla z18.h, z6.h, z1.h[2]\n"
+                "fmla z19.h, z6.h, z1.h[3]\n"
+                "fmla z20.h, z6.h, z1.h[4]\n"
+                "fmla z21.h, z6.h, z1.h[5]\n"
+                "fmla z22.h, z6.h, z1.h[6]\n"
+                "fmla z23.h, z6.h, z1.h[7]\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "fmla z24.h, z7.h, z1.h[0]\n"
+                "fmla z25.h, z7.h, z1.h[1]\n"
+                "fmla z26.h, z7.h, z1.h[2]\n"
+                "fmla z27.h, z7.h, z1.h[3]\n"
+                "fmla z28.h, z7.h, z1.h[4]\n"
+                "fmla z29.h, z7.h, z1.h[5]\n"
+                "fmla z30.h, z7.h, z1.h[6]\n"
+                "fmla z31.h, z7.h, z1.h[7]\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                "fmla z8.h, z2.h, z0.h[0]\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.h, z2.h, z0.h[1]\n"
+                "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.h, z2.h, z0.h[2]\n"
+                "fmla z11.h, z2.h, z0.h[3]\n"
+                "fmla z12.h, z2.h, z0.h[4]\n"
+                "fmla z13.h, z2.h, z0.h[5]\n"
+                "fmla z14.h, z2.h, z0.h[6]\n"
+                "fmla z15.h, z2.h, z0.h[7]\n"
+                "ld1h z2.h, p0/z, [%[b_ptr]]\n"
+                "fmla z16.h, z3.h, z0.h[0]\n"
+                "fmla z17.h, z3.h, z0.h[1]\n"
+                "fmla z18.h, z3.h, z0.h[2]\n"
+                "fmla z19.h, z3.h, z0.h[3]\n"
+                "fmla z20.h, z3.h, z0.h[4]\n"
+                "fmla z21.h, z3.h, z0.h[5]\n"
+                "fmla z22.h, z3.h, z0.h[6]\n"
+                "fmla z23.h, z3.h, z0.h[7]\n"
+                "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "fmla z24.h, z4.h, z0.h[0]\n"
+                "fmla z25.h, z4.h, z0.h[1]\n"
+                "fmla z26.h, z4.h, z0.h[2]\n"
+                "fmla z27.h, z4.h, z0.h[3]\n"
+                "fmla z28.h, z4.h, z0.h[4]\n"
+                "fmla z29.h, z4.h, z0.h[5]\n"
+                "fmla z30.h, z4.h, z0.h[6]\n"
+                "fmla z31.h, z4.h, z0.h[7]\n"
+                "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "fmla z8.h, z5.h, z1.h[0]\n"
+                "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
+                "fmla z9.h, z5.h, z1.h[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x10\n"
+                "fmla z10.h, z5.h, z1.h[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "fmla z11.h, z5.h, z1.h[3]\n"
+                "fmla z12.h, z5.h, z1.h[4]\n"
+                "fmla z13.h, z5.h, z1.h[5]\n"
+                "fmla z14.h, z5.h, z1.h[6]\n"
+                "fmla z15.h, z5.h, z1.h[7]\n"
+                "fmla z16.h, z6.h, z1.h[0]\n"
+                "fmla z17.h, z6.h, z1.h[1]\n"
+                "fmla z18.h, z6.h, z1.h[2]\n"
+                "fmla z19.h, z6.h, z1.h[3]\n"
+                "fmla z20.h, z6.h, z1.h[4]\n"
+                "fmla z21.h, z6.h, z1.h[5]\n"
+                "fmla z22.h, z6.h, z1.h[6]\n"
+                "fmla z23.h, z6.h, z1.h[7]\n"
+                "fmla z24.h, z7.h, z1.h[0]\n"
+                "fmla z25.h, z7.h, z1.h[1]\n"
+                "fmla z26.h, z7.h, z1.h[2]\n"
+                "fmla z27.h, z7.h, z1.h[3]\n"
+                "fmla z28.h, z7.h, z1.h[4]\n"
+                "fmla z29.h, z7.h, z1.h[5]\n"
+                "fmla z30.h, z7.h, z1.h[6]\n"
+                "fmla z31.h, z7.h, z1.h[7]\n"
+                "fmla z8.h, z2.h, z0.h[0]\n"
+                "st1h z8.h, p0, [%[c_ptr]]\n"
+                "fmla z9.h, z2.h, z0.h[1]\n"
+                "fmla z10.h, z2.h, z0.h[2]\n"
+                "fmla z11.h, z2.h, z0.h[3]\n"
+                "fmla z12.h, z2.h, z0.h[4]\n"
+                "fmla z13.h, z2.h, z0.h[5]\n"
+                "fmla z14.h, z2.h, z0.h[6]\n"
+                "fmla z15.h, z2.h, z0.h[7]\n"
+                "fmla z16.h, z3.h, z0.h[0]\n"
+                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z17.h, z3.h, z0.h[1]\n"
+                "fmla z18.h, z3.h, z0.h[2]\n"
+                "fmla z19.h, z3.h, z0.h[3]\n"
+                "fmla z20.h, z3.h, z0.h[4]\n"
+                "fmla z21.h, z3.h, z0.h[5]\n"
+                "fmla z22.h, z3.h, z0.h[6]\n"
+                "fmla z23.h, z3.h, z0.h[7]\n"
+                "fmla z24.h, z4.h, z0.h[0]\n"
+                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z25.h, z4.h, z0.h[1]\n"
+                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+                "fmla z26.h, z4.h, z0.h[2]\n"
+                "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+                "fmla z27.h, z4.h, z0.h[3]\n"
+                "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+                "fmla z28.h, z4.h, z0.h[4]\n"
+                "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "fmla z29.h, z4.h, z0.h[5]\n"
+                "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+                "fmla z30.h, z4.h, z0.h[6]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "fmla z31.h, z4.h, z0.h[7]\n"
+                "b 4f\n"
+                "3:\n"
+                "fmla z8.h, z2.h, z0.h[0]\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.h, z2.h, z0.h[1]\n"
+                "ld1rqh z1.h, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.h, z2.h, z0.h[2]\n"
+                "fmla z11.h, z2.h, z0.h[3]\n"
+                "fmla z12.h, z2.h, z0.h[4]\n"
+                "fmla z13.h, z2.h, z0.h[5]\n"
+                "fmla z14.h, z2.h, z0.h[6]\n"
+                "fmla z15.h, z2.h, z0.h[7]\n"
+                "fmla z16.h, z3.h, z0.h[0]\n"
+                "fmla z17.h, z3.h, z0.h[1]\n"
+                "fmla z18.h, z3.h, z0.h[2]\n"
+                "fmla z19.h, z3.h, z0.h[3]\n"
+                "fmla z20.h, z3.h, z0.h[4]\n"
+                "fmla z21.h, z3.h, z0.h[5]\n"
+                "fmla z22.h, z3.h, z0.h[6]\n"
+                "fmla z23.h, z3.h, z0.h[7]\n"
+                "fmla z24.h, z4.h, z0.h[0]\n"
+                "fmla z25.h, z4.h, z0.h[1]\n"
+                "fmla z26.h, z4.h, z0.h[2]\n"
+                "fmla z27.h, z4.h, z0.h[3]\n"
+                "fmla z28.h, z4.h, z0.h[4]\n"
+                "fmla z29.h, z4.h, z0.h[5]\n"
+                "fmla z30.h, z4.h, z0.h[6]\n"
+                "fmla z31.h, z4.h, z0.h[7]\n"
+                "fmla z8.h, z5.h, z1.h[0]\n"
+                "st1h z8.h, p0, [%[c_ptr]]\n"
+                "fmla z9.h, z5.h, z1.h[1]\n"
+                "fmla z10.h, z5.h, z1.h[2]\n"
+                "fmla z11.h, z5.h, z1.h[3]\n"
+                "fmla z12.h, z5.h, z1.h[4]\n"
+                "fmla z13.h, z5.h, z1.h[5]\n"
+                "fmla z14.h, z5.h, z1.h[6]\n"
+                "fmla z15.h, z5.h, z1.h[7]\n"
+                "fmla z16.h, z6.h, z1.h[0]\n"
+                "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z17.h, z6.h, z1.h[1]\n"
+                "fmla z18.h, z6.h, z1.h[2]\n"
+                "fmla z19.h, z6.h, z1.h[3]\n"
+                "fmla z20.h, z6.h, z1.h[4]\n"
+                "fmla z21.h, z6.h, z1.h[5]\n"
+                "fmla z22.h, z6.h, z1.h[6]\n"
+                "fmla z23.h, z6.h, z1.h[7]\n"
+                "fmla z24.h, z7.h, z1.h[0]\n"
+                "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z25.h, z7.h, z1.h[1]\n"
+                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
+                "fmla z26.h, z7.h, z1.h[2]\n"
+                "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
+                "fmla z27.h, z7.h, z1.h[3]\n"
+                "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
+                "fmla z28.h, z7.h, z1.h[4]\n"
+                "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "fmla z29.h, z7.h, z1.h[5]\n"
+                "st1h z18.h, p0, [%[c_ptr], #7, MUL VL]\n"
+                "fmla z30.h, z7.h, z1.h[6]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "fmla z31.h, z7.h, z1.h[7]\n"
+                "4:\n"
+                "st1h z26.h, p0, [%[c_ptr], #-8, MUL VL]\n"
+                "st1h z11.h, p0, [%[c_ptr], #-7, MUL VL]\n"
+                "st1h z19.h, p0, [%[c_ptr], #-6, MUL VL]\n"
+                "st1h z27.h, p0, [%[c_ptr], #-5, MUL VL]\n"
+                "st1h z12.h, p0, [%[c_ptr], #-4, MUL VL]\n"
+                "st1h z20.h, p0, [%[c_ptr], #-3, MUL VL]\n"
+                "st1h z28.h, p0, [%[c_ptr], #-2, MUL VL]\n"
+                "st1h z13.h, p0, [%[c_ptr], #-1, MUL VL]\n"
+                "st1h z21.h, p0, [%[c_ptr]]\n"
+                "st1h z29.h, p0, [%[c_ptr], #1, MUL VL]\n"
+                "st1h z14.h, p0, [%[c_ptr], #2, MUL VL]\n"
+                "st1h z22.h, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1h z30.h, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1h z15.h, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1h z23.h, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1h z31.h, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #8\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
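
Stripped of the software pipelining and the odd-K tail handling, the assembly above accumulates an 8-row by 3-vector-width block of C over K: each k step broadcasts 8 values from the A panel against 3 vectors from the B panel. A scalar reference of that inner block, using float for portability (the real kernel operates on __fp16 panels):

#include <vector>

// C[8 x W] += A-panel[K x 8] * B-panel[K x W], where W is three vector widths.
void interleaved_block_reference(const std::vector<float> &a_panel, // K * 8 values, 8 per k step
                                 const std::vector<float> &b_panel, // K * W values, W per k step
                                 std::vector<float>       &c_block, // 8 * W accumulators
                                 int K, int W)
{
    for(int k = 0; k < K; ++k)
    {
        for(int row = 0; row < 8; ++row)
        {
            for(int col = 0; col < W; ++col)
            {
                c_block[row * W + col] += a_panel[k * 8 + row] * b_panel[k * W + col];
            }
        }
    }
}
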
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
new file mode 100644
index 0000000..b2327f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int);
+
+class interleaved_fp32_mla_3VLx8 {
+public:
+    typedef float operand_type;
+    typedef float result_type;
+
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
+
+    /* Kernel blocking parameters */
+    static int out_width()
+    {
+        return svcntw() * 3;
+    }
+
+    static int out_height()
+    {
+        return 8;
+    }
+
+    static int k_unroll()
+    {
+        return 1;
+    }
+
+    // Use the standard fixed size transforms.
+    StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
+
+    kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
+
+    interleaved_fp32_mla_3VLx8(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
new file mode 100644
index 0000000..bb08fc7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
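+    // The main assembly loop below consumes two k steps per iteration (two groups of
+    // eight A values and two groups of three B vectors); one double step is peeled off
+    // into the epilogue, and "tails" records whether a single odd k step remains.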
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "mov z8.s, #0\n"
+                "ptrue p0.s\n"
+                "mov z9.s, #0\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                "mov z10.s, #0\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                "mov z11.s, #0\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                "mov z12.s, #0\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "mov z13.s, #0\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+                "mov z14.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "mov z15.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z16.s, #0\n"
+                "mov z17.s, #0\n"
+                "mov z18.s, #0\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
+                "mov z21.s, #0\n"
+                "mov z22.s, #0\n"
+                "mov z23.s, #0\n"
+                "mov z24.s, #0\n"
+                "mov z25.s, #0\n"
+                "mov z26.s, #0\n"
+                "mov z27.s, #0\n"
+                "mov z28.s, #0\n"
+                "mov z29.s, #0\n"
+                "mov z30.s, #0\n"
+                "mov z31.s, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                "fmla z8.s, z4.s, z0.s[0]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.s, z4.s, z0.s[1]\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.s, z4.s, z0.s[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
+                "fmla z11.s, z4.s, z0.s[3]\n"
+                "fmla z20.s, z4.s, z1.s[0]\n"
+                "fmla z21.s, z4.s, z1.s[1]\n"
+                "fmla z22.s, z4.s, z1.s[2]\n"
+                "fmla z23.s, z4.s, z1.s[3]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                "fmla z12.s, z5.s, z0.s[0]\n"
+                "fmla z13.s, z5.s, z0.s[1]\n"
+                "fmla z14.s, z5.s, z0.s[2]\n"
+                "fmla z15.s, z5.s, z0.s[3]\n"
+                "fmla z24.s, z5.s, z1.s[0]\n"
+                "fmla z25.s, z5.s, z1.s[1]\n"
+                "fmla z26.s, z5.s, z1.s[2]\n"
+                "fmla z27.s, z5.s, z1.s[3]\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "fmla z16.s, z6.s, z0.s[0]\n"
+                "fmla z17.s, z6.s, z0.s[1]\n"
+                "fmla z18.s, z6.s, z0.s[2]\n"
+                "fmla z19.s, z6.s, z0.s[3]\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                "fmla z28.s, z6.s, z1.s[0]\n"
+                "fmla z29.s, z6.s, z1.s[1]\n"
+                "fmla z30.s, z6.s, z1.s[2]\n"
+                "fmla z31.s, z6.s, z1.s[3]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "fmla z8.s, z4.s, z2.s[0]\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                "fmla z9.s, z4.s, z2.s[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "fmla z10.s, z4.s, z2.s[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "fmla z11.s, z4.s, z2.s[3]\n"
+                "fmla z20.s, z4.s, z3.s[0]\n"
+                "fmla z21.s, z4.s, z3.s[1]\n"
+                "fmla z22.s, z4.s, z3.s[2]\n"
+                "fmla z23.s, z4.s, z3.s[3]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "fmla z12.s, z5.s, z2.s[0]\n"
+                "fmla z13.s, z5.s, z2.s[1]\n"
+                "fmla z14.s, z5.s, z2.s[2]\n"
+                "fmla z15.s, z5.s, z2.s[3]\n"
+                "fmla z24.s, z5.s, z3.s[0]\n"
+                "fmla z25.s, z5.s, z3.s[1]\n"
+                "fmla z26.s, z5.s, z3.s[2]\n"
+                "fmla z27.s, z5.s, z3.s[3]\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "fmla z16.s, z6.s, z2.s[0]\n"
+                "fmla z17.s, z6.s, z2.s[1]\n"
+                "fmla z18.s, z6.s, z2.s[2]\n"
+                "fmla z19.s, z6.s, z2.s[3]\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+                "fmla z28.s, z6.s, z3.s[0]\n"
+                "fmla z29.s, z6.s, z3.s[1]\n"
+                "fmla z30.s, z6.s, z3.s[2]\n"
+                "fmla z31.s, z6.s, z3.s[3]\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                "fmla z8.s, z4.s, z0.s[0]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.s, z4.s, z0.s[1]\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.s, z4.s, z0.s[2]\n"
+                "fmla z11.s, z4.s, z0.s[3]\n"
+                "fmla z20.s, z4.s, z1.s[0]\n"
+                "fmla z21.s, z4.s, z1.s[1]\n"
+                "fmla z22.s, z4.s, z1.s[2]\n"
+                "fmla z23.s, z4.s, z1.s[3]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                "fmla z12.s, z5.s, z0.s[0]\n"
+                "fmla z13.s, z5.s, z0.s[1]\n"
+                "fmla z14.s, z5.s, z0.s[2]\n"
+                "fmla z15.s, z5.s, z0.s[3]\n"
+                "fmla z24.s, z5.s, z1.s[0]\n"
+                "fmla z25.s, z5.s, z1.s[1]\n"
+                "fmla z26.s, z5.s, z1.s[2]\n"
+                "fmla z27.s, z5.s, z1.s[3]\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "fmla z16.s, z6.s, z0.s[0]\n"
+                "fmla z17.s, z6.s, z0.s[1]\n"
+                "fmla z18.s, z6.s, z0.s[2]\n"
+                "fmla z19.s, z6.s, z0.s[3]\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                "fmla z28.s, z6.s, z1.s[0]\n"
+                "fmla z29.s, z6.s, z1.s[1]\n"
+                "fmla z30.s, z6.s, z1.s[2]\n"
+                "fmla z31.s, z6.s, z1.s[3]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "fmla z8.s, z4.s, z2.s[0]\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                "fmla z9.s, z4.s, z2.s[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "fmla z10.s, z4.s, z2.s[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "fmla z11.s, z4.s, z2.s[3]\n"
+                "fmla z20.s, z4.s, z3.s[0]\n"
+                "fmla z21.s, z4.s, z3.s[1]\n"
+                "fmla z22.s, z4.s, z3.s[2]\n"
+                "fmla z23.s, z4.s, z3.s[3]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "fmla z12.s, z5.s, z2.s[0]\n"
+                "fmla z13.s, z5.s, z2.s[1]\n"
+                "fmla z14.s, z5.s, z2.s[2]\n"
+                "fmla z15.s, z5.s, z2.s[3]\n"
+                "fmla z24.s, z5.s, z3.s[0]\n"
+                "fmla z25.s, z5.s, z3.s[1]\n"
+                "fmla z26.s, z5.s, z3.s[2]\n"
+                "fmla z27.s, z5.s, z3.s[3]\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "fmla z16.s, z6.s, z2.s[0]\n"
+                "fmla z17.s, z6.s, z2.s[1]\n"
+                "fmla z18.s, z6.s, z2.s[2]\n"
+                "fmla z19.s, z6.s, z2.s[3]\n"
+                "fmla z28.s, z6.s, z3.s[0]\n"
+                "fmla z29.s, z6.s, z3.s[1]\n"
+                "fmla z30.s, z6.s, z3.s[2]\n"
+                "fmla z31.s, z6.s, z3.s[3]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z8.s, z4.s, z0.s[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "fmla z9.s, z4.s, z0.s[1]\n"
+                "fmla z10.s, z4.s, z0.s[2]\n"
+                "fmla z11.s, z4.s, z0.s[3]\n"
+                "fmla z20.s, z4.s, z1.s[0]\n"
+                "fmla z21.s, z4.s, z1.s[1]\n"
+                "fmla z22.s, z4.s, z1.s[2]\n"
+                "fmla z23.s, z4.s, z1.s[3]\n"
+                "fmla z12.s, z5.s, z0.s[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z13.s, z5.s, z0.s[1]\n"
+                "fmla z14.s, z5.s, z0.s[2]\n"
+                "fmla z15.s, z5.s, z0.s[3]\n"
+                "fmla z24.s, z5.s, z1.s[0]\n"
+                "fmla z25.s, z5.s, z1.s[1]\n"
+                "fmla z26.s, z5.s, z1.s[2]\n"
+                "fmla z27.s, z5.s, z1.s[3]\n"
+                "fmla z16.s, z6.s, z0.s[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z17.s, z6.s, z0.s[1]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "fmla z18.s, z6.s, z0.s[2]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "fmla z19.s, z6.s, z0.s[3]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "fmla z28.s, z6.s, z1.s[0]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "fmla z29.s, z6.s, z1.s[1]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "fmla z30.s, z6.s, z1.s[2]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "fmla z31.s, z6.s, z1.s[3]\n"
+                "b 4f\n"
+                "3:\n"
+                "fmla z8.s, z4.s, z0.s[0]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z9.s, z4.s, z0.s[1]\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                "fmla z10.s, z4.s, z0.s[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "fmla z11.s, z4.s, z0.s[3]\n"
+                "fmla z20.s, z4.s, z1.s[0]\n"
+                "fmla z21.s, z4.s, z1.s[1]\n"
+                "fmla z22.s, z4.s, z1.s[2]\n"
+                "fmla z23.s, z4.s, z1.s[3]\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "fmla z12.s, z5.s, z0.s[0]\n"
+                "fmla z13.s, z5.s, z0.s[1]\n"
+                "fmla z14.s, z5.s, z0.s[2]\n"
+                "fmla z15.s, z5.s, z0.s[3]\n"
+                "fmla z24.s, z5.s, z1.s[0]\n"
+                "fmla z25.s, z5.s, z1.s[1]\n"
+                "fmla z26.s, z5.s, z1.s[2]\n"
+                "fmla z27.s, z5.s, z1.s[3]\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "fmla z16.s, z6.s, z0.s[0]\n"
+                "fmla z17.s, z6.s, z0.s[1]\n"
+                "fmla z18.s, z6.s, z0.s[2]\n"
+                "fmla z19.s, z6.s, z0.s[3]\n"
+                "fmla z28.s, z6.s, z1.s[0]\n"
+                "fmla z29.s, z6.s, z1.s[1]\n"
+                "fmla z30.s, z6.s, z1.s[2]\n"
+                "fmla z31.s, z6.s, z1.s[3]\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "fmla z8.s, z4.s, z2.s[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "fmla z9.s, z4.s, z2.s[1]\n"
+                "fmla z10.s, z4.s, z2.s[2]\n"
+                "fmla z11.s, z4.s, z2.s[3]\n"
+                "fmla z20.s, z4.s, z3.s[0]\n"
+                "fmla z21.s, z4.s, z3.s[1]\n"
+                "fmla z22.s, z4.s, z3.s[2]\n"
+                "fmla z23.s, z4.s, z3.s[3]\n"
+                "fmla z12.s, z5.s, z2.s[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z13.s, z5.s, z2.s[1]\n"
+                "fmla z14.s, z5.s, z2.s[2]\n"
+                "fmla z15.s, z5.s, z2.s[3]\n"
+                "fmla z24.s, z5.s, z3.s[0]\n"
+                "fmla z25.s, z5.s, z3.s[1]\n"
+                "fmla z26.s, z5.s, z3.s[2]\n"
+                "fmla z27.s, z5.s, z3.s[3]\n"
+                "fmla z16.s, z6.s, z2.s[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z17.s, z6.s, z2.s[1]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "fmla z18.s, z6.s, z2.s[2]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "fmla z19.s, z6.s, z2.s[3]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "fmla z28.s, z6.s, z3.s[0]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "fmla z29.s, z6.s, z3.s[1]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "fmla z30.s, z6.s, z3.s[2]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "fmla z31.s, z6.s, z3.s[3]\n"
+                "4:\n"
+                "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+                "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+                "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+                "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+                "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+                "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+                "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+                "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+                "st1w z25.s, p0, [%[c_ptr]]\n"
+                "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #8\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
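
The kernel above is the interleaved fp32 micro-kernel: for every k step it combines 8 values of A (one per output row) with three vectors of B (3*VL output columns), keeping all 8 x 3VL partial sums in z8-z31 and writing the finished tile to Cpanel at the end. A plain scalar sketch of the same arithmetic, assuming the usual interleaved panel layout produced by the transforms (8 A values per k step, 3*VL B values per k step, row-major tile output); tile_reference and W are illustrative names, not library code:

    // Scalar model of one 8 x W output tile, W = 3 * vector length.
    //   Apanel: K groups of 8 floats (one value per output row per k step)
    //   Bpanel: K groups of W floats (one value per output column per k step)
    static void tile_reference(const float *Apanel, const float *Bpanel, float *Cpanel,
                               int K, int W)
    {
        for (int r = 0; r < 8; r++) {
            for (int c = 0; c < W; c++) {
                float acc = 0.0f;
                for (int k = 0; k < K; k++) {
                    acc += Apanel[k * 8 + r] * Bpanel[k * W + c];
                }
                Cpanel[r * W + c] = acc;   // tile stored row-major, as the st1w sequence does
            }
        }
    }
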
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
new file mode 100644
index 0000000..91aa567
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+class interleaved_s8s32_dot_3VLx8 {
+public:
+    typedef int8_t operand_type;
+    typedef int32_t result_type;
+
+    typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int);
+
+    /* Kernel blocking parameters */
+    static int out_width()
+    {
+        return svcntw() * 3;
+    }
+
+    static int out_height()
+    {
+        return 8;
+    }
+
+    static int k_unroll()
+    {
+        return 4;
+    }
+
+    // Use the standard SVE transforms (block sizes fixed in units of the vector length).
+    StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+
+    kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
+
+    interleaved_s8s32_dot_3VLx8(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
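
Relative to the fp32 kernel, k_unroll() is 4 here because each SDOT instruction consumes four consecutive int8 values per 32-bit lane and accumulates their products into that lane. A hypothetical scalar sketch of a single lane's contribution (sdot_lane is an illustrative name):

    #include <cstdint>

    // One SDOT lane: a 4-way int8 dot product accumulated into a 32-bit value.
    static inline int32_t sdot_lane(int32_t acc, const int8_t a[4], const int8_t b[4])
    {
        for (int i = 0; i < 4; i++) {
            acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
        }
        return acc;
    }

This is also why the generic.cpp below divides K by 4 before computing its loop counts: the kernel's k steps are counted in groups of four int8 values.
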
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
new file mode 100644
index 0000000..2e994a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *a_ptr = Apanel;
+    int32_t *c_ptr = Cpanel;
+
+    K /= 4;
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "mov z8.s, #0\n"
+                "ptrue p0.b\n"
+                "mov z9.s, #0\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "mov z10.s, #0\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "mov z11.s, #0\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "mov z12.s, #0\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "mov z13.s, #0\n"
+                "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+                "mov z14.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "mov z15.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z16.s, #0\n"
+                "mov z17.s, #0\n"
+                "mov z18.s, #0\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
+                "mov z21.s, #0\n"
+                "mov z22.s, #0\n"
+                "mov z23.s, #0\n"
+                "mov z24.s, #0\n"
+                "mov z25.s, #0\n"
+                "mov z26.s, #0\n"
+                "mov z27.s, #0\n"
+                "mov z28.s, #0\n"
+                "mov z29.s, #0\n"
+                "mov z30.s, #0\n"
+                "mov z31.s, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                "sdot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "sdot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "sdot z10.s, z4.b, z0.b[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
+                "sdot z11.s, z4.b, z0.b[3]\n"
+                "sdot z20.s, z4.b, z1.b[0]\n"
+                "sdot z21.s, z4.b, z1.b[1]\n"
+                "sdot z22.s, z4.b, z1.b[2]\n"
+                "sdot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "sdot z12.s, z5.b, z0.b[0]\n"
+                "sdot z13.s, z5.b, z0.b[1]\n"
+                "sdot z14.s, z5.b, z0.b[2]\n"
+                "sdot z15.s, z5.b, z0.b[3]\n"
+                "sdot z24.s, z5.b, z1.b[0]\n"
+                "sdot z25.s, z5.b, z1.b[1]\n"
+                "sdot z26.s, z5.b, z1.b[2]\n"
+                "sdot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "sdot z16.s, z6.b, z0.b[0]\n"
+                "sdot z17.s, z6.b, z0.b[1]\n"
+                "sdot z18.s, z6.b, z0.b[2]\n"
+                "sdot z19.s, z6.b, z0.b[3]\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "sdot z28.s, z6.b, z1.b[0]\n"
+                "sdot z29.s, z6.b, z1.b[1]\n"
+                "sdot z30.s, z6.b, z1.b[2]\n"
+                "sdot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "sdot z8.s, z4.b, z2.b[0]\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "sdot z9.s, z4.b, z2.b[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "sdot z10.s, z4.b, z2.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "sdot z11.s, z4.b, z2.b[3]\n"
+                "sdot z20.s, z4.b, z3.b[0]\n"
+                "sdot z21.s, z4.b, z3.b[1]\n"
+                "sdot z22.s, z4.b, z3.b[2]\n"
+                "sdot z23.s, z4.b, z3.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "sdot z12.s, z5.b, z2.b[0]\n"
+                "sdot z13.s, z5.b, z2.b[1]\n"
+                "sdot z14.s, z5.b, z2.b[2]\n"
+                "sdot z15.s, z5.b, z2.b[3]\n"
+                "sdot z24.s, z5.b, z3.b[0]\n"
+                "sdot z25.s, z5.b, z3.b[1]\n"
+                "sdot z26.s, z5.b, z3.b[2]\n"
+                "sdot z27.s, z5.b, z3.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "sdot z16.s, z6.b, z2.b[0]\n"
+                "sdot z17.s, z6.b, z2.b[1]\n"
+                "sdot z18.s, z6.b, z2.b[2]\n"
+                "sdot z19.s, z6.b, z2.b[3]\n"
+                "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
+                "sdot z28.s, z6.b, z3.b[0]\n"
+                "sdot z29.s, z6.b, z3.b[1]\n"
+                "sdot z30.s, z6.b, z3.b[2]\n"
+                "sdot z31.s, z6.b, z3.b[3]\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                "sdot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "sdot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "sdot z10.s, z4.b, z0.b[2]\n"
+                "sdot z11.s, z4.b, z0.b[3]\n"
+                "sdot z20.s, z4.b, z1.b[0]\n"
+                "sdot z21.s, z4.b, z1.b[1]\n"
+                "sdot z22.s, z4.b, z1.b[2]\n"
+                "sdot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "sdot z12.s, z5.b, z0.b[0]\n"
+                "sdot z13.s, z5.b, z0.b[1]\n"
+                "sdot z14.s, z5.b, z0.b[2]\n"
+                "sdot z15.s, z5.b, z0.b[3]\n"
+                "sdot z24.s, z5.b, z1.b[0]\n"
+                "sdot z25.s, z5.b, z1.b[1]\n"
+                "sdot z26.s, z5.b, z1.b[2]\n"
+                "sdot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "sdot z16.s, z6.b, z0.b[0]\n"
+                "sdot z17.s, z6.b, z0.b[1]\n"
+                "sdot z18.s, z6.b, z0.b[2]\n"
+                "sdot z19.s, z6.b, z0.b[3]\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "sdot z28.s, z6.b, z1.b[0]\n"
+                "sdot z29.s, z6.b, z1.b[1]\n"
+                "sdot z30.s, z6.b, z1.b[2]\n"
+                "sdot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "sdot z8.s, z4.b, z2.b[0]\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "sdot z9.s, z4.b, z2.b[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "sdot z10.s, z4.b, z2.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "sdot z11.s, z4.b, z2.b[3]\n"
+                "sdot z20.s, z4.b, z3.b[0]\n"
+                "sdot z21.s, z4.b, z3.b[1]\n"
+                "sdot z22.s, z4.b, z3.b[2]\n"
+                "sdot z23.s, z4.b, z3.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "sdot z12.s, z5.b, z2.b[0]\n"
+                "sdot z13.s, z5.b, z2.b[1]\n"
+                "sdot z14.s, z5.b, z2.b[2]\n"
+                "sdot z15.s, z5.b, z2.b[3]\n"
+                "sdot z24.s, z5.b, z3.b[0]\n"
+                "sdot z25.s, z5.b, z3.b[1]\n"
+                "sdot z26.s, z5.b, z3.b[2]\n"
+                "sdot z27.s, z5.b, z3.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "sdot z16.s, z6.b, z2.b[0]\n"
+                "sdot z17.s, z6.b, z2.b[1]\n"
+                "sdot z18.s, z6.b, z2.b[2]\n"
+                "sdot z19.s, z6.b, z2.b[3]\n"
+                "sdot z28.s, z6.b, z3.b[0]\n"
+                "sdot z29.s, z6.b, z3.b[1]\n"
+                "sdot z30.s, z6.b, z3.b[2]\n"
+                "sdot z31.s, z6.b, z3.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "sdot z8.s, z4.b, z0.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "sdot z9.s, z4.b, z0.b[1]\n"
+                "sdot z10.s, z4.b, z0.b[2]\n"
+                "sdot z11.s, z4.b, z0.b[3]\n"
+                "sdot z20.s, z4.b, z1.b[0]\n"
+                "sdot z21.s, z4.b, z1.b[1]\n"
+                "sdot z22.s, z4.b, z1.b[2]\n"
+                "sdot z23.s, z4.b, z1.b[3]\n"
+                "sdot z12.s, z5.b, z0.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "sdot z13.s, z5.b, z0.b[1]\n"
+                "sdot z14.s, z5.b, z0.b[2]\n"
+                "sdot z15.s, z5.b, z0.b[3]\n"
+                "sdot z24.s, z5.b, z1.b[0]\n"
+                "sdot z25.s, z5.b, z1.b[1]\n"
+                "sdot z26.s, z5.b, z1.b[2]\n"
+                "sdot z27.s, z5.b, z1.b[3]\n"
+                "sdot z16.s, z6.b, z0.b[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "sdot z17.s, z6.b, z0.b[1]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "sdot z18.s, z6.b, z0.b[2]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "sdot z19.s, z6.b, z0.b[3]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "sdot z28.s, z6.b, z1.b[0]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "sdot z29.s, z6.b, z1.b[1]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "sdot z30.s, z6.b, z1.b[2]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "sdot z31.s, z6.b, z1.b[3]\n"
+                "b 4f\n"
+                "3:\n"
+                "sdot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "sdot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "sdot z10.s, z4.b, z0.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "sdot z11.s, z4.b, z0.b[3]\n"
+                "sdot z20.s, z4.b, z1.b[0]\n"
+                "sdot z21.s, z4.b, z1.b[1]\n"
+                "sdot z22.s, z4.b, z1.b[2]\n"
+                "sdot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "sdot z12.s, z5.b, z0.b[0]\n"
+                "sdot z13.s, z5.b, z0.b[1]\n"
+                "sdot z14.s, z5.b, z0.b[2]\n"
+                "sdot z15.s, z5.b, z0.b[3]\n"
+                "sdot z24.s, z5.b, z1.b[0]\n"
+                "sdot z25.s, z5.b, z1.b[1]\n"
+                "sdot z26.s, z5.b, z1.b[2]\n"
+                "sdot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "sdot z16.s, z6.b, z0.b[0]\n"
+                "sdot z17.s, z6.b, z0.b[1]\n"
+                "sdot z18.s, z6.b, z0.b[2]\n"
+                "sdot z19.s, z6.b, z0.b[3]\n"
+                "sdot z28.s, z6.b, z1.b[0]\n"
+                "sdot z29.s, z6.b, z1.b[1]\n"
+                "sdot z30.s, z6.b, z1.b[2]\n"
+                "sdot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "sdot z8.s, z4.b, z2.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "sdot z9.s, z4.b, z2.b[1]\n"
+                "sdot z10.s, z4.b, z2.b[2]\n"
+                "sdot z11.s, z4.b, z2.b[3]\n"
+                "sdot z20.s, z4.b, z3.b[0]\n"
+                "sdot z21.s, z4.b, z3.b[1]\n"
+                "sdot z22.s, z4.b, z3.b[2]\n"
+                "sdot z23.s, z4.b, z3.b[3]\n"
+                "sdot z12.s, z5.b, z2.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "sdot z13.s, z5.b, z2.b[1]\n"
+                "sdot z14.s, z5.b, z2.b[2]\n"
+                "sdot z15.s, z5.b, z2.b[3]\n"
+                "sdot z24.s, z5.b, z3.b[0]\n"
+                "sdot z25.s, z5.b, z3.b[1]\n"
+                "sdot z26.s, z5.b, z3.b[2]\n"
+                "sdot z27.s, z5.b, z3.b[3]\n"
+                "sdot z16.s, z6.b, z2.b[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "sdot z17.s, z6.b, z2.b[1]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "sdot z18.s, z6.b, z2.b[2]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "sdot z19.s, z6.b, z2.b[3]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "sdot z28.s, z6.b, z3.b[0]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "sdot z29.s, z6.b, z3.b[1]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "sdot z30.s, z6.b, z3.b[2]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "sdot z31.s, z6.b, z3.b[3]\n"
+                "4:\n"
+                "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+                "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+                "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+                "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+                "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+                "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+                "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+                "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+                "st1w z25.s, p0, [%[c_ptr]]\n"
+                "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #8\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
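
As in the fp32 kernel, the main loop handles two dot-product k steps per iteration, with one double step peeled into the epilogue. A small illustrative sketch of the bookkeeping for a depth of K_int8 int8 values (s8s32_loop_counts and LoopCounts are made-up names):

    // Illustrative bookkeeping for the s8s32 kernel's loop counts.
    struct LoopCounts {
        long loops;  // full iterations of the main asm loop (two dot steps each)
        long tails;  // 1 if an odd dot step remains, else 0
    };

    static inline LoopCounts s8s32_loop_counts(int K_int8)
    {
        const int K_dot = K_int8 / 4;      // SDOT consumes four int8 values per step
        return { (K_dot / 2) - 1, K_dot % 2 };
    }

For example, K_int8 = 40 gives K_dot = 10: four main-loop iterations, one peeled pair in the epilogue, and no odd tail.
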
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
new file mode 100644
index 0000000..ef457e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../std_transforms_sve.hpp"
+
+namespace arm_gemm {
+
+// Actual kernel implementations
+void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+class interleaved_u8u32_dot_3VLx8 {
+public:
+    typedef uint8_t operand_type;
+    typedef uint32_t result_type;
+
+    typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+
+    /* Kernel blocking parameters */
+    static int out_width()
+    {
+        return svcntw() * 3;
+    }
+
+    static int out_height()
+    {
+        return 8;
+    }
+
+    static int k_unroll()
+    {
+        return 4;
+    }
+
+    // Use the standard SVE transforms (block sizes fixed in units of the vector length).
+    StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+
+    kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
+
+    interleaved_u8u32_dot_3VLx8(const CPUInfo *ci)
+    {
+
+    }
+};
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
new file mode 100644
index 0000000..f4d33a9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <cstdint>
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *a_ptr = Apanel;
+    uint32_t *c_ptr = Cpanel;
+
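+    // Same structure as the signed variant above, but accumulating with UDOT:
+    // K counts groups of four uint8 values, two such groups per main-loop pass.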
+    K /= 4;
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "mov z8.s, #0\n"
+                "ptrue p0.b\n"
+                "mov z9.s, #0\n"
+                "mov z10.s, #0\n"
+                "mov z11.s, #0\n"
+                "mov z12.s, #0\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "mov z13.s, #0\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "mov z14.s, #0\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "mov z15.s, #0\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "mov z16.s, #0\n"
+                "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
+                "mov z17.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "mov z18.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
+                "mov z21.s, #0\n"
+                "mov z22.s, #0\n"
+                "mov z23.s, #0\n"
+                "mov z24.s, #0\n"
+                "mov z25.s, #0\n"
+                "mov z26.s, #0\n"
+                "mov z27.s, #0\n"
+                "mov z28.s, #0\n"
+                "mov z29.s, #0\n"
+                "mov z30.s, #0\n"
+                "mov z31.s, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                "udot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "udot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "udot z10.s, z4.b, z0.b[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
+                "udot z11.s, z4.b, z0.b[3]\n"
+                "udot z20.s, z4.b, z1.b[0]\n"
+                "udot z21.s, z4.b, z1.b[1]\n"
+                "udot z22.s, z4.b, z1.b[2]\n"
+                "udot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "udot z12.s, z5.b, z0.b[0]\n"
+                "udot z13.s, z5.b, z0.b[1]\n"
+                "udot z14.s, z5.b, z0.b[2]\n"
+                "udot z15.s, z5.b, z0.b[3]\n"
+                "udot z24.s, z5.b, z1.b[0]\n"
+                "udot z25.s, z5.b, z1.b[1]\n"
+                "udot z26.s, z5.b, z1.b[2]\n"
+                "udot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "udot z16.s, z6.b, z0.b[0]\n"
+                "udot z17.s, z6.b, z0.b[1]\n"
+                "udot z18.s, z6.b, z0.b[2]\n"
+                "udot z19.s, z6.b, z0.b[3]\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "udot z28.s, z6.b, z1.b[0]\n"
+                "udot z29.s, z6.b, z1.b[1]\n"
+                "udot z30.s, z6.b, z1.b[2]\n"
+                "udot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "udot z8.s, z4.b, z2.b[0]\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "udot z9.s, z4.b, z2.b[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "udot z10.s, z4.b, z2.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "udot z11.s, z4.b, z2.b[3]\n"
+                "udot z20.s, z4.b, z3.b[0]\n"
+                "udot z21.s, z4.b, z3.b[1]\n"
+                "udot z22.s, z4.b, z3.b[2]\n"
+                "udot z23.s, z4.b, z3.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "udot z12.s, z5.b, z2.b[0]\n"
+                "udot z13.s, z5.b, z2.b[1]\n"
+                "udot z14.s, z5.b, z2.b[2]\n"
+                "udot z15.s, z5.b, z2.b[3]\n"
+                "udot z24.s, z5.b, z3.b[0]\n"
+                "udot z25.s, z5.b, z3.b[1]\n"
+                "udot z26.s, z5.b, z3.b[2]\n"
+                "udot z27.s, z5.b, z3.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "udot z16.s, z6.b, z2.b[0]\n"
+                "udot z17.s, z6.b, z2.b[1]\n"
+                "udot z18.s, z6.b, z2.b[2]\n"
+                "udot z19.s, z6.b, z2.b[3]\n"
+                "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
+                "udot z28.s, z6.b, z3.b[0]\n"
+                "udot z29.s, z6.b, z3.b[1]\n"
+                "udot z30.s, z6.b, z3.b[2]\n"
+                "udot z31.s, z6.b, z3.b[3]\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                "udot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "udot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "udot z10.s, z4.b, z0.b[2]\n"
+                "udot z11.s, z4.b, z0.b[3]\n"
+                "udot z20.s, z4.b, z1.b[0]\n"
+                "udot z21.s, z4.b, z1.b[1]\n"
+                "udot z22.s, z4.b, z1.b[2]\n"
+                "udot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
+                "udot z12.s, z5.b, z0.b[0]\n"
+                "udot z13.s, z5.b, z0.b[1]\n"
+                "udot z14.s, z5.b, z0.b[2]\n"
+                "udot z15.s, z5.b, z0.b[3]\n"
+                "udot z24.s, z5.b, z1.b[0]\n"
+                "udot z25.s, z5.b, z1.b[1]\n"
+                "udot z26.s, z5.b, z1.b[2]\n"
+                "udot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "udot z16.s, z6.b, z0.b[0]\n"
+                "udot z17.s, z6.b, z0.b[1]\n"
+                "udot z18.s, z6.b, z0.b[2]\n"
+                "udot z19.s, z6.b, z0.b[3]\n"
+                "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
+                "udot z28.s, z6.b, z1.b[0]\n"
+                "udot z29.s, z6.b, z1.b[1]\n"
+                "udot z30.s, z6.b, z1.b[2]\n"
+                "udot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "udot z8.s, z4.b, z2.b[0]\n"
+                "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
+                "udot z9.s, z4.b, z2.b[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "udot z10.s, z4.b, z2.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #6\n"
+                "udot z11.s, z4.b, z2.b[3]\n"
+                "udot z20.s, z4.b, z3.b[0]\n"
+                "udot z21.s, z4.b, z3.b[1]\n"
+                "udot z22.s, z4.b, z3.b[2]\n"
+                "udot z23.s, z4.b, z3.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "udot z12.s, z5.b, z2.b[0]\n"
+                "udot z13.s, z5.b, z2.b[1]\n"
+                "udot z14.s, z5.b, z2.b[2]\n"
+                "udot z15.s, z5.b, z2.b[3]\n"
+                "udot z24.s, z5.b, z3.b[0]\n"
+                "udot z25.s, z5.b, z3.b[1]\n"
+                "udot z26.s, z5.b, z3.b[2]\n"
+                "udot z27.s, z5.b, z3.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "udot z16.s, z6.b, z2.b[0]\n"
+                "udot z17.s, z6.b, z2.b[1]\n"
+                "udot z18.s, z6.b, z2.b[2]\n"
+                "udot z19.s, z6.b, z2.b[3]\n"
+                "udot z28.s, z6.b, z3.b[0]\n"
+                "udot z29.s, z6.b, z3.b[1]\n"
+                "udot z30.s, z6.b, z3.b[2]\n"
+                "udot z31.s, z6.b, z3.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "udot z8.s, z4.b, z0.b[0]\n"
+                "udot z9.s, z4.b, z0.b[1]\n"
+                "udot z10.s, z4.b, z0.b[2]\n"
+                "udot z11.s, z4.b, z0.b[3]\n"
+                "udot z20.s, z4.b, z1.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "udot z21.s, z4.b, z1.b[1]\n"
+                "udot z22.s, z4.b, z1.b[2]\n"
+                "udot z23.s, z4.b, z1.b[3]\n"
+                "udot z12.s, z5.b, z0.b[0]\n"
+                "udot z13.s, z5.b, z0.b[1]\n"
+                "udot z14.s, z5.b, z0.b[2]\n"
+                "udot z15.s, z5.b, z0.b[3]\n"
+                "udot z24.s, z5.b, z1.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "udot z25.s, z5.b, z1.b[1]\n"
+                "udot z26.s, z5.b, z1.b[2]\n"
+                "udot z27.s, z5.b, z1.b[3]\n"
+                "udot z16.s, z6.b, z0.b[0]\n"
+                "udot z17.s, z6.b, z0.b[1]\n"
+                "udot z18.s, z6.b, z0.b[2]\n"
+                "udot z19.s, z6.b, z0.b[3]\n"
+                "udot z28.s, z6.b, z1.b[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "udot z29.s, z6.b, z1.b[1]\n"
+                "udot z30.s, z6.b, z1.b[2]\n"
+                "udot z31.s, z6.b, z1.b[3]\n"
+                "b 4f\n"
+                "3:\n"
+                "udot z8.s, z4.b, z0.b[0]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "udot z9.s, z4.b, z0.b[1]\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
+                "udot z10.s, z4.b, z0.b[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "udot z11.s, z4.b, z0.b[3]\n"
+                "udot z20.s, z4.b, z1.b[0]\n"
+                "udot z21.s, z4.b, z1.b[1]\n"
+                "udot z22.s, z4.b, z1.b[2]\n"
+                "udot z23.s, z4.b, z1.b[3]\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                "udot z12.s, z5.b, z0.b[0]\n"
+                "udot z13.s, z5.b, z0.b[1]\n"
+                "udot z14.s, z5.b, z0.b[2]\n"
+                "udot z15.s, z5.b, z0.b[3]\n"
+                "udot z24.s, z5.b, z1.b[0]\n"
+                "udot z25.s, z5.b, z1.b[1]\n"
+                "udot z26.s, z5.b, z1.b[2]\n"
+                "udot z27.s, z5.b, z1.b[3]\n"
+                "ld1b z5.b, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                "udot z16.s, z6.b, z0.b[0]\n"
+                "udot z17.s, z6.b, z0.b[1]\n"
+                "udot z18.s, z6.b, z0.b[2]\n"
+                "udot z19.s, z6.b, z0.b[3]\n"
+                "udot z28.s, z6.b, z1.b[0]\n"
+                "udot z29.s, z6.b, z1.b[1]\n"
+                "udot z30.s, z6.b, z1.b[2]\n"
+                "udot z31.s, z6.b, z1.b[3]\n"
+                "ld1b z6.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                "udot z8.s, z4.b, z2.b[0]\n"
+                "udot z9.s, z4.b, z2.b[1]\n"
+                "udot z10.s, z4.b, z2.b[2]\n"
+                "udot z11.s, z4.b, z2.b[3]\n"
+                "udot z20.s, z4.b, z3.b[0]\n"
+                "st1w z8.s, p0, [%[c_ptr]]\n"
+                "udot z21.s, z4.b, z3.b[1]\n"
+                "udot z22.s, z4.b, z3.b[2]\n"
+                "udot z23.s, z4.b, z3.b[3]\n"
+                "udot z12.s, z5.b, z2.b[0]\n"
+                "udot z13.s, z5.b, z2.b[1]\n"
+                "udot z14.s, z5.b, z2.b[2]\n"
+                "udot z15.s, z5.b, z2.b[3]\n"
+                "udot z24.s, z5.b, z3.b[0]\n"
+                "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "udot z25.s, z5.b, z3.b[1]\n"
+                "udot z26.s, z5.b, z3.b[2]\n"
+                "udot z27.s, z5.b, z3.b[3]\n"
+                "udot z16.s, z6.b, z2.b[0]\n"
+                "udot z17.s, z6.b, z2.b[1]\n"
+                "udot z18.s, z6.b, z2.b[2]\n"
+                "udot z19.s, z6.b, z2.b[3]\n"
+                "udot z28.s, z6.b, z3.b[0]\n"
+                "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "udot z29.s, z6.b, z3.b[1]\n"
+                "udot z30.s, z6.b, z3.b[2]\n"
+                "udot z31.s, z6.b, z3.b[3]\n"
+                "4:\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z14.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "st1w z18.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+                "st1w z11.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+                "st1w z15.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+                "st1w z19.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+                "st1w z20.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+                "st1w z24.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+                "st1w z28.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+                "st1w z21.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+                "st1w z25.s, p0, [%[c_ptr]]\n"
+                "st1w z29.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "st1w z22.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "st1w z26.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z30.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z23.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z27.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z31.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #8\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
index ee32ce7..35d4cc5 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_int32_12x8.hpp
@@ -273,7 +273,7 @@
 template<>
 inline void MergeResults<12, 8>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t alpha, const uint32_t beta) {
   // Since the above code uses only MUL and MLA instructions discard the "unsignedness" and proceed safely.
-  MergeResults<12, 8>(reinterpret_cast<int32_t*>(out), reinterpret_cast<const int32_t*>(in), ldout, y0, ymax, x0, xmax, static_cast<const int32_t>(alpha), static_cast<const int32_t>(beta));
+  MergeResults<12, 8>(reinterpret_cast<int32_t*>(out), reinterpret_cast<const int32_t*>(in), ldout, y0, ymax, x0, xmax, static_cast<int32_t>(alpha), static_cast<int32_t>(beta));
 }
 
 #endif // __aarch64__
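
The one-character change above removes the top-level const from the cast's target type: static_cast<const int32_t>(x) and static_cast<int32_t>(x) yield the same prvalue, but the qualifier on a non-class cast result is ignored and compilers typically warn about it (e.g. GCC's -Wignored-qualifiers). The delegation itself is safe because, modulo 2^32, signed and unsigned multiply-accumulate produce the same bit pattern; a minimal self-contained check of that claim (not library code):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        int32_t  sa = -2, sb = 3, acc_s = 7;
        uint32_t ua = static_cast<uint32_t>(sa), ub = 3u, acc_u = 7u;

        uint32_t r_unsigned = acc_u + ua * ub;                         // wraps modulo 2^32
        uint32_t r_signed   = static_cast<uint32_t>(acc_s + sa * sb);  // same bits
        assert(r_unsigned == r_signed);
        return 0;
    }
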
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index d93f1b0..181d1a4 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -26,3 +26,5 @@
 #include "a64_merge_float_to_half_12x8.hpp"
 #include "a64_merge_half_24x8.hpp"
 #include "a64_merge_int32_12x8.hpp"
+#include "sve_merge_fp32_2VLx8.hpp"
+#include "sve_merge_fp32_3VLx8.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp
new file mode 100644
index 0000000..7479c8d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_2VLx8.hpp
@@ -0,0 +1,1208 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+inline void MergeResults<2, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
+    const float *inptr = in;
+
+    for (int y=y0; y<ymax; y+=8) {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        const int height = ymax - y;
+
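+        // Each strip merges the accumulated block into the output as
+        // out = alpha * acc + beta * out. When beta == 0 the destination is
+        // write-only, so the cases below skip loading it; "height" selects how
+        // many of the eight row pointers are valid for this strip.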
+        for (int i=x0; i<xmax; i+=(2 * get_vector_length<float>())) {
+            if (beta==0.0f)
+            {
+                switch(height) {
+                case 1:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 2:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 3:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 4:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 5:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 6:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 7:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr6], #0x40]\n"
+                            "addvl %[outptr6], %[outptr6], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                default:
+                case 8:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x40]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr6], #0x40]\n"
+                            "addvl %[outptr6], %[outptr6], #2\n"
+                            "prfm PSTL1KEEP, [%[outptr7], #0x40]\n"
+                            "addvl %[outptr7], %[outptr7], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+
+                }
+            }
+            else
+            {
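+                /* beta != 0: the existing output is loaded, scaled by beta,
+                 * and the alpha-scaled input is accumulated on top via fmla. */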
+                switch(height) {
+                case 1:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 2:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 3:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 4:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 5:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 6:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 7:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = beta * out + alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr6], #0x40]\n"
+                            "addvl %[outptr6], %[outptr6], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                default:
+                case 8:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = alpha * in + beta * out */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x100]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x140]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr7]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x40]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #2\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x40]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #2\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x40]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr7], #1, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #2\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x40]\n"
+                            "addvl %[outptr4], %[outptr4], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x40]\n"
+                            "addvl %[outptr5], %[outptr5], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr6], #0x40]\n"
+                            "addvl %[outptr6], %[outptr6], #2\n"
+                            "prfm PLDL1KEEP, [%[outptr7], #0x40]\n"
+                            "addvl %[outptr7], %[outptr7], #2\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #16\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                
+                }
+            }
+        }
+    }
+}
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
new file mode 100644
index 0000000..27084c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -0,0 +1,1564 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+inline void MergeResults<3, 8, true>(float *out, const float *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const float alpha, const float beta)
+{
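+    /*
+     * Merges one block of up to 8 rows by (3 x vector-length) columns from the
+     * packed input buffer into the row-major output (row stride ldout),
+     * computing out = alpha * in + beta * out per element.  The inner loop
+     * below steps x by 3 * get_vector_length<float>() per iteration; the SVE
+     * "whilelt" predicate p0 masks the ragged tail in x.  Each asm block sets
+     * x8 = inptr + 16 vector-lengths as a second base register because the
+     * ld1w immediate offset only reaches -8..+7 MUL VL.
+     */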
+    const float *inptr = in;
+
+    for (int y=y0; y<ymax; y+=8) {
+        float *outptr0 = out + (y * ldout) + x0;
+        float *outptr1 = outptr0 + ldout;
+        float *outptr2 = outptr1 + ldout;
+        float *outptr3 = outptr2 + ldout;
+        float *outptr4 = outptr3 + ldout;
+        float *outptr5 = outptr4 + ldout;
+        float *outptr6 = outptr5 + ldout;
+        float *outptr7 = outptr6 + ldout;
+
+        const int height = ymax - y;
+
+        for (int i=x0; i<xmax; i+=(3 * get_vector_length<float>())) {
+            if (beta==0.0f)
+            {
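+                /* beta == 0: the existing output is never loaded, so this path
+                   computes out = alpha * in only (and never reads potentially
+                   uninitialised output memory). */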
+                switch(height) {
+                case 1:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 2:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 3:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 4:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 5:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 6:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 7:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                            "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+                            "addvl %[outptr6], %[outptr6], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
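+                /* height = ymax - y can exceed 8 on all but the last row
+                   block; any height of 8 or more takes the full 8-row variant
+                   via the default label, and later iterations of the y loop
+                   handle the remaining rows. */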
+                default:
+                case 8:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block (beta == 0): out = alpha * in */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z8.s, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z9.s, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "fmul z10.s, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
+                            "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
+                            "prfm PSTL1KEEP, [%[outptr3], #0x60]\n"
+                            "fmul z11.s, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                            "prfm PSTL1KEEP, [%[outptr6], #0x60]\n"
+                            "addvl %[outptr6], %[outptr6], #3\n"
+                            "prfm PSTL1KEEP, [%[outptr7], #0x60]\n"
+                            "addvl %[outptr7], %[outptr7], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                
+                }
+            }
+            else
+            {
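+                /* General case: each block loads the existing output, scales
+                   it by beta (fmul), accumulates alpha * in on top (fmla), and
+                   stores the result back. */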
+                switch(height) {
+                case 1:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to merge an entire block: out = alpha * in + beta * out */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 2:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 3:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 4:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 5:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z9.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 6:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                case 7:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z11.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z10.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #1, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr6], #2, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr6], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                            "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+                            "addvl %[outptr6], %[outptr6], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                default:
+                case 8:
+                    {
+                        long w = xmax - i;
+                        long p = 0;
+                        /* Optimized routine to copy an entire block */
+                        __asm __volatile (
+                            "mov z2.s, %s[alpha]\n"
+                            "addvl x8, %[inptr], #16\n"
+                            "mov z3.s, %s[beta]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0]]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr]]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x180]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #3, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2]]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x240]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #6, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3]]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4]]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-4, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4]]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5]]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #-1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5]]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6]]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #2, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6]]\n"
+                            "ld1w z11.s, p0/z, [%[outptr7]]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7]]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #1, MUL VL]\n"
+                            "incw %[p], all, mul #1\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #1, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x1c0]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #4, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #1, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x280]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [%[inptr], #7, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #1, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4], #1, MUL VL]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-3, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #1, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5], #1, MUL VL]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #1, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6], #1, MUL VL]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #3, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #1, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr7], #1, MUL VL]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #6, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #1, MUL VL]\n"
+                            "whilelt p0.s, %[p], %[w]\n"
+                            "b.none 1f\n"
+                            "ld1w z8.s, p0/z, [%[outptr0], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr0], #0x60]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [%[inptr], #2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr0], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr1], #2, MUL VL]\n"
+                            "addvl %[outptr0], %[outptr0], #3\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [%[inptr], #5, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr1], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr2], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr1], #0x60]\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #-8, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr2], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr3], #2, MUL VL]\n"
+                            "addvl %[outptr1], %[outptr1], #3\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #-5, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr3], #2, MUL VL]\n"
+                            "ld1w z8.s, p0/z, [%[outptr4], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x200]\n"
+                            "fmul z8.s, z8.s, z3.s\n"
+                            "ld1w z4.s, p0/z, [x8, #-2, MUL VL]\n"
+                            "fmla z8.s, p0/m, z4.s, z2.s\n"
+                            "st1w z8.s, p0, [%[outptr4], #2, MUL VL]\n"
+                            "ld1w z9.s, p0/z, [%[outptr5], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr2], #0x60]\n"
+                            "fmul z9.s, z9.s, z3.s\n"
+                            "ld1w z5.s, p0/z, [x8, #1, MUL VL]\n"
+                            "fmla z9.s, p0/m, z5.s, z2.s\n"
+                            "st1w z9.s, p0, [%[outptr5], #2, MUL VL]\n"
+                            "ld1w z10.s, p0/z, [%[outptr6], #2, MUL VL]\n"
+                            "addvl %[outptr2], %[outptr2], #3\n"
+                            "fmul z10.s, z10.s, z3.s\n"
+                            "ld1w z6.s, p0/z, [x8, #4, MUL VL]\n"
+                            "fmla z10.s, p0/m, z6.s, z2.s\n"
+                            "st1w z10.s, p0, [%[outptr6], #2, MUL VL]\n"
+                            "ld1w z11.s, p0/z, [%[outptr7], #2, MUL VL]\n"
+                            "prfm PLDL1KEEP, [%[outptr3], #0x60]\n"
+                            "fmul z11.s, z11.s, z3.s\n"
+                            "ld1w z7.s, p0/z, [x8, #7, MUL VL]\n"
+                            "fmla z11.s, p0/m, z7.s, z2.s\n"
+                            "st1w z11.s, p0, [%[outptr7], #2, MUL VL]\n"
+                            "addvl %[outptr3], %[outptr3], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr4], #0x60]\n"
+                            "addvl %[outptr4], %[outptr4], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr5], #0x60]\n"
+                            "addvl %[outptr5], %[outptr5], #3\n"
+                            "prfm PLDL1KEEP, [%[inptr], #0x2c0]\n"
+                            "prfm PLDL1KEEP, [%[outptr6], #0x60]\n"
+                            "addvl %[outptr6], %[outptr6], #3\n"
+                            "prfm PLDL1KEEP, [%[outptr7], #0x60]\n"
+                            "addvl %[outptr7], %[outptr7], #3\n"
+                            "1:\n"
+                            "addvl %[inptr], %[inptr], #24\n"
+                        : [outptr0] "+r" (outptr0), [outptr1] "+r" (outptr1), [outptr2] "+r" (outptr2), [outptr3] "+r" (outptr3), [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7),
+                          [inptr] "+r" (inptr), [p] "+r" (p)
+                        : [alpha] "w" (alpha), [beta] "w" (beta), [w] "r" (w)
+                        : "x8", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "memory", "cc"
+                        );
+                    }
+                    break;
+                
+                
+                }
+            }
+        }
+    }
+}
+
+#endif // __ARM_FEATURE_SVE
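
Editorial note on the generated merge blocks above: each `case` handles a tile of up to 8 output rows by up to 3 SVE vector lengths of columns. In the branch shown in full here (the general beta path), every predicated column chunk is computed as out = beta * out + alpha * in, with `whilelt`/`incw` building the tail predicate and `addvl x8, inptr, #16` providing a second base so input offsets beyond the +/-7 MUL VL immediate range of `ld1w` stay addressable. The following is a minimal scalar sketch of the same merge, assuming the packed input tile stores each row's 3*VL elements contiguously (the function name and parameters are illustrative, not part of the patch):

    // Hedged scalar reference for the SVE merge above (illustrative only).
    // 'vl' is the number of floats per SVE vector; the packed input block holds
    // 'height' rows of 3*vl floats each, and 'width' (<= 3*vl) is the number of
    // valid output columns in this tile.  'outptr' is an array of row pointers,
    // mirroring outptr0..outptr7 in the assembly.
    static void merge_tile_reference(float *const *outptr, const float *inptr,
                                     int height, int width, int vl,
                                     float alpha, float beta)
    {
        for (int r = 0; r < height; r++) {
            for (int c = 0; c < width; c++) {
                // beta == 0 path skips reading the (possibly uninitialised) output.
                float acc = (beta == 0.0f) ? 0.0f : beta * outptr[r][c];
                outptr[r][c] = acc + alpha * inptr[r * 3 * vl + c];
            }
        }
    }

The assembly additionally advances each live output pointer by 3 vector lengths and the packed input pointer by 24 vector lengths per tile, and interleaves prefetches, all of which the sketch omits.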
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
new file mode 100644
index 0000000..b7323eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Define "standard" transforms for the blocked GEMMs for SVE.
+ *
+ * This assumes that A is interleaved 'height' ways, B is interleaved
+ * 'width'xVL ways and transposed, and that the merge needs to work in
+ * 'height' x 'width'xVL blocks.
+ *
+ * The optional 'block' parameter is for kernels using dot-product type
+ * instructions like UDOT and SDOT.
+ */
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1>
+class StdTransformsSVE
+{
+public:
+    template<typename TIn>
+    void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
+                  const int ymax, const int k0, const int kmax, bool transposed) {
+        if (transposed) {
+            Transform<height, block,  true>(out, in, stride, y0, ymax, k0, kmax);
+        } else {
+            Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+        }
+    }
+
+    template<typename TIn>
+    void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
+                  const int xmax, const int k0, const int kmax, bool transposed) {
+        if (transposed) {
+            Transform<width_vectors, block, false, true>(out, in, stride, x0, xmax, k0, kmax);
+        } else {
+            Transform<width_vectors, block,  true, true>(out, in, stride, x0, xmax, k0, kmax);
+        }
+    }
+
+    template<typename TOut>
+    void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut alpha, const TOut beta) {
+        MergeResults<width_vectors / mmla, height, true>(out, in, stride, y0, ymax, x0, xmax, alpha, beta);
+    }
+};
+
+} // namespace arm_gemm
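
The comment block above describes the packing contract; for orientation, a GEMM strategy class would typically expose one of these bundles so the generic interleaved driver can call PrepareA, PrepareB and Merge without knowing the kernel's layout. A hedged sketch, where the struct name and the 8x3 parameters are illustrative assumptions rather than something taken from this patch:

    #include "std_transforms_sve.hpp"

    namespace arm_gemm {

    // Hypothetical strategy shape: 8 interleaved rows of A, 3 vector-lengths of
    // B columns, no dot-product blocking (block defaults to 1).
    struct hypothetical_sve_fp32_strategy {
        typedef float operand_type;
        typedef float result_type;

        StdTransformsSVE<operand_type, result_type, 8, 3> transforms = {};
    };

    } // namespace arm_gemm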
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index 77d0d87..e422b91 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -40,7 +40,7 @@
     static void Transform(TOut* out, const TIn* const in, const int stride,
                           const int y0, const int ymax, const int x0, const int xmax) {
         // For SVE cases we multiply the interleave factor by the vector length.
-        const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() : 1);
+        const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() / BlockBy : 1);
 
         const int n_whole_y_blocks = (ymax - y0) / IntBy;
         const int y_remainders = (ymax - y0) % IntBy;
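
As a worked example of the changed expression above (all numbers are assumptions for illustration, not values read from the library): get_vector_length<TOut>() returns the number of TOut elements per SVE vector, so a hypothetical 512-bit implementation gives 16 for a 32-bit TOut; with tIntBy = 8 and BlockBy = 2 the SVE branch now yields 8 * (16 / 2) = 64, where the pre-change expression would have produced 8 * 16 = 128.

// Illustrative arithmetic only; the vector length and block size are assumptions.
constexpr unsigned int tIntBy  = 8;                        // interleave factor template parameter
constexpr unsigned int BlockBy = 2;                        // hypothetical block size
constexpr unsigned int vl      = 16;                       // assumed 32-bit elements per SVE vector
constexpr unsigned int IntBy   = tIntBy * (vl / BlockBy);  // SVE branch of the expression above
static_assert(IntBy == 64, "8 * (16 / 2) == 64");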
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 492abe5..1ccdf60 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -35,7 +35,7 @@
     uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
     const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
 
-    uint32_t zerobuff[8];
+    uint32_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
 
     for (int y=y0; y<ymax; y+=6) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
@@ -137,7 +137,7 @@
                 : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
                   [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
                 :
-                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
+                : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"
             );
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
index 91ee492..500ed78 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -35,7 +35,7 @@
     uint16_t *outptr = (uint16_t *)out;
     const uint16_t *inptr = (const uint16_t *)in;
 
-    uint16_t zerobuff[24];
+    uint16_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
 
     for (int y=y0; y<ymax; y+=8) {
         const uint16_t *inptr0 = inptr + y * ldin + k0;
@@ -147,7 +147,7 @@
                 : [skippf] "r" (skippf)
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
                   "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-                  "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+                  "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory"
             );
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 7a32f33..347eafb 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -35,7 +35,7 @@
     uint32_t *outptr = (uint32_t *)out;
     const uint32_t *inptr = (uint32_t *)in;
 
-    uint32_t zerobuff[8];
+    uint32_t zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
 
     for (int y=y0; y<ymax; y+=8) {
         const uint32_t *inptr0 = inptr + y * ldin + k0;
@@ -156,7 +156,7 @@
                   [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
                 :
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
             );
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
index 773d56d..88b40d7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -35,7 +35,7 @@
     float *outptr = out;
     const __fp16 *inptr = in;
 
-    __fp16 zerobuff[8];
+    __fp16 zerobuff[16]; // 8 for asm loop plus up to 7 for overflow loop
 
     for (int y=y0; y<ymax; y+=8) {
         const __fp16 *inptr0 = inptr + y * ldin + k0;
@@ -172,7 +172,7 @@
                   [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
                 :
                 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
-                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
             );
         }
 
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index 8ad5b85..fc1f2c2 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -23,9 +23,15 @@
  */
 #include "a32_interleave_6way_32bit.hpp"
 #include "a32_transpose_interleave_8way_32bit.hpp"
+#ifdef __ARM_FEATURE_SVE
+#include "sve_interleave_8way_32bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
+#include "sve_interleave_8way_block4_8bit.hpp"
+#else
+#include "a64_interleave_8way_32bit.hpp"
+#endif
 #include "a64_block16_interleave4_8bit.hpp"
 #include "a64_interleave_8way_16bit.hpp"
-#include "a64_interleave_8way_32bit.hpp"
 #include "a64_interleave_8way_half_to_float.hpp"
 #include "a64_transpose_interleave_12way_16bit.hpp"
 #include "a64_transpose_interleave_12way_half_to_float.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
new file mode 100644
index 0000000..752e837
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+    for (int y=y0; y<ymax; y+=8)
+    {
+        const int height = ymax-y;
+        const long inwidth = (kmax - k0);
+        const long outwidth = inwidth * 8;
+        long inpos = 0;
+        long outpos = 0;
+
+        uint32_t *outptr = master_outptr;
+        master_outptr += outwidth;
+
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+        const uint32_t *inptr6 = inptr5 + ldin;
+        const uint32_t *inptr7 = inptr6 + ldin;
+
+        switch(height)
+        {
+            case 1:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z0.s, z8.s, z4.s\n"
+                    "zip2 z1.s, z8.s, z4.s\n"
+                    "zip1 z2.s, z9.s, z4.s\n"
+                    "zip2 z3.s, z9.s, z4.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z4.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z4.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 2:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "mov z14.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z0.s, z8.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z1.s, z8.s, z4.s\n"
+                    "zip1 z2.s, z9.s, z4.s\n"
+                    "zip2 z3.s, z9.s, z4.s\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z6.s, z11.s, z14.s\n"
+                    "zip2 z7.s, z11.s, z14.s\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 3:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "mov z14.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z14.s\n"
+                    "zip2 z7.s, z11.s, z14.s\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 4:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z4.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 5:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z5.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z5.s\n"
+                    "zip2 z13.s, z2.s, z5.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z5.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z5.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 6:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z6.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z6.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 7:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z7.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            default:
+            case 8:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+
+        }
+    }
+}
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
new file mode 100644
index 0000000..4cc4311
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
+    const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
+
+    for (int y=y0; y<ymax; y+=8)
+    {
+        const int height = ymax-y;
+        const long inwidth = (kmax - k0);
+        const long outwidth = (inwidth * 8 + 1) / 2;
+        long inpos = 0;
+        long outpos = 0;
+
+        uint32_t *outptr = master_outptr;
+        master_outptr += (outwidth * 2);
+
+        const uint32_t *inptr0 = inptr + y * ldin + k0;
+        const uint32_t *inptr1 = inptr0 + ldin;
+        const uint32_t *inptr2 = inptr1 + ldin;
+        const uint32_t *inptr3 = inptr2 + ldin;
+        const uint32_t *inptr4 = inptr3 + ldin;
+        const uint32_t *inptr5 = inptr4 + ldin;
+        const uint32_t *inptr6 = inptr5 + ldin;
+        const uint32_t *inptr7 = inptr6 + ldin;
+
+        switch(height)
+        {
+            case 1:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip1 z0.d, z8.d, z4.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z4.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z13.d, z2.d, z4.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip1 z14.d, z3.d, z4.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip2 z15.d, z3.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 2:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip1 z0.d, z8.d, z4.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z4.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z4.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "mov z14.s, #0\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip1 z6.d, z11.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z7.d, z11.d, z14.d\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 3:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip1 z12.d, z2.d, z4.d\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip2 z13.d, z2.d, z4.d\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "mov z14.s, #0\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip1 z6.d, z11.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z7.d, z11.d, z14.d\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 4:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z12.d, z2.d, z4.d\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip2 z13.d, z2.d, z4.d\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip1 z14.d, z3.d, z4.d\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip2 z15.d, z3.d, z4.d\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 5:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z5.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "zip1 z12.d, z2.d, z5.d\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip2 z13.d, z2.d, z5.d\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip1 z14.d, z3.d, z5.d\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "zip2 z15.d, z3.d, z5.d\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+            
+            case 6:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z6.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "zip1 z14.d, z3.d, z6.d\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "zip2 z15.d, z3.d, z6.d\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+            
+            case 7:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z7.s, #0\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "addvl %[inptr6], %[inptr6], #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+            
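+            // Heights of 8 or more take this path: all eight rows are present, so no zero padding is needed.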
+            default:
+            case 8:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.s, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "addvl %[inptr0], %[inptr0], #1\n"
+                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
+                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "ld1w z7.s, p0/z, [%[inptr7]]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "addvl %[inptr6], %[inptr6], #1\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "addvl %[inptr7], %[inptr7], #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z5.d\n"
+                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "zip2 z11.d, z1.d, z5.d\n"
+                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "zip1 z12.d, z2.d, z6.d\n"
+                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "zip2 z15.d, z3.d, z7.d\n"
+                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
+                    "incd %[outpos], all, mul #1\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+            
+            
+        }
+    }
+}
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
new file mode 100644
index 0000000..f1690ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __ARM_FEATURE_SVE
+
+template<>
+template<typename T>
+inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
+{
+    uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
+    const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
+
+    for (int y=y0; y<ymax; y+=8)
+    {
+        const int height = ymax-y;
+        const long inwidth = (kmax - k0);
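+        // Each block packs 4 bytes from each of the 8 rows (32 output bytes), so the row length is rounded up to a multiple of 4.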
+        const long outwidth = ((inwidth + 3) / 4) * 32;
+        long inpos = 0;
+        long outpos = 0;
+
+        uint8_t *outptr = master_outptr;
+        master_outptr += outwidth;
+
+        const uint8_t *inptr0 = inptr + y * ldin + k0;
+        const uint8_t *inptr1 = inptr0 + ldin;
+        const uint8_t *inptr2 = inptr1 + ldin;
+        const uint8_t *inptr3 = inptr2 + ldin;
+        const uint8_t *inptr4 = inptr3 + ldin;
+        const uint8_t *inptr5 = inptr4 + ldin;
+        const uint8_t *inptr6 = inptr5 + ldin;
+        const uint8_t *inptr7 = inptr6 + ldin;
+
+        switch(height)
+        {
+            case 1:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z0.s, z8.s, z4.s\n"
+                    "zip2 z1.s, z8.s, z4.s\n"
+                    "zip1 z2.s, z9.s, z4.s\n"
+                    "zip2 z3.s, z9.s, z4.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z4.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z4.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 2:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.b, #0\n"
+                    "mov z14.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip1 z0.s, z8.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z1.s, z8.s, z4.s\n"
+                    "zip1 z2.s, z9.s, z4.s\n"
+                    "zip2 z3.s, z9.s, z4.s\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z6.s, z11.s, z14.s\n"
+                    "zip2 z7.s, z11.s, z14.s\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 3:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.b, #0\n"
+                    "mov z14.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z14.s\n"
+                    "zip2 z7.s, z11.s, z14.s\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 4:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z4.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z4.s\n"
+                    "zip2 z11.s, z1.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z4.s\n"
+                    "zip2 z13.s, z2.s, z4.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z4.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 5:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z5.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z12.s, z2.s, z5.s\n"
+                    "zip2 z13.s, z2.s, z5.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z5.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z5.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 6:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z6.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z6.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            case 7:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "mov z7.b, #0\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+            default:
+            case 8:
+                __asm __volatile(
+                    "1:\n"
+                    "whilelt p0.b, %[inpos], %[inwidth]\n"
+                    "b.none 2f\n"
+                    "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
+                    "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
+                    "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
+                    "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
+                    "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
+                    "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
+                    "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
+                    "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
+                    "incb %[inpos], all, mul #1\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "whilelt p0.b, %[outpos], %[outwidth]\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "whilelt p1.b, %[outpos], %[outwidth]\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "zip1 z0.s, z8.s, z12.s\n"
+                    "zip2 z1.s, z8.s, z12.s\n"
+                    "zip1 z2.s, z9.s, z13.s\n"
+                    "whilelt p2.b, %[outpos], %[outwidth]\n"
+                    "zip2 z3.s, z9.s, z13.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z4.s, z10.s, z14.s\n"
+                    "zip2 z5.s, z10.s, z14.s\n"
+                    "zip1 z6.s, z11.s, z15.s\n"
+                    "zip2 z7.s, z11.s, z15.s\n"
+                    "whilelt p3.b, %[outpos], %[outwidth]\n"
+                    "zip1 z8.s, z0.s, z4.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip2 z9.s, z0.s, z4.s\n"
+                    "zip1 z10.s, z1.s, z5.s\n"
+                    "zip2 z11.s, z1.s, z5.s\n"
+                    "st1b z8.b, p0, [%[outptr]]\n"
+                    "zip1 z12.s, z2.s, z6.s\n"
+                    "whilelt p4.b, %[outpos], %[outwidth]\n"
+                    "zip2 z13.s, z2.s, z6.s\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "zip1 z14.s, z3.s, z7.s\n"
+                    "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip2 z15.s, z3.s, z7.s\n"
+                    "whilelt p5.b, %[outpos], %[outwidth]\n"
+                    "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.b, %[outpos], %[outwidth]\n"
+                    "incb %[outpos], all, mul #1\n"
+                    "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
+                    "addvl %[outptr], %[outptr], #8\n"
+                    "b 1b\n"
+                    "2:\n"
+                : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
+                : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                );
+                break;
+
+
+        }
+    }
+}
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index b77bc7a..a1fc00e 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -24,6 +24,10 @@
 
 #pragma once
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+
 // Macro for unreachable code (e.g. impossible default cases on switch)
 #define UNREACHABLE(why)  __builtin_unreachable()
 
@@ -31,23 +35,27 @@
 // #define UNREACHABLE(why)   assert(0 && why)
 
 inline int iceildiv(const int a, const int b) {
-  return (a + b - 1) / b;
+    return (a + b - 1) / b;
 }
 
 template <typename T>
 inline T roundup(const T a, const T b) {
-  T rem = a % b;
+    T rem = a % b;
 
-  if (rem) {
-    return a + b - rem;
-  } else {
-    return a;
-  }
+    if (rem) {
+        return a + b - rem;
+    } else {
+        return a;
+    }
 }
 
 template <typename T>
 inline unsigned long get_vector_length() {
+#ifdef __ARM_FEATURE_SVE
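+    // svcntb() returns the SVE vector length in bytes, so the result scales with the hardware implementation.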
+    const unsigned long length = svcntb();
+#else
     const unsigned long length = 16;
+#endif
 
     return length / sizeof(T);
-}
+}
\ No newline at end of file
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
new file mode 100644
index 0000000..09ac08c
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/Helpers.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
+
+#include "NEGEMMInterleavedStrategies.h"
+
+namespace arm_compute
+{
+namespace
+{
+template <typename InputType, bool use_dot = false>
+BlockSizes calculate_block_sizes_template(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
+{
+    using strategy = typename Kernel<InputType, use_dot>::strategy;
+    return calculate_block_sizes<strategy>(ci, M, N, K);
+}
+} // namespace
+
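+// Map the input data type (and dot-product availability) to the name of the arm_gemm strategy kernel that will be used.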
+const char *get_strategy_name(DataType input_type, bool use_dot)
+{
+    switch(input_type)
+    {
+        case DataType::F32:
+            return Kernel<float>::name;
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                return Kernel<uint8_t, true>::name;
+            }
+            else
+            {
+                return Kernel<uint8_t, false>::name;
+            }
+        case DataType::S8:
+            if(use_dot)
+            {
+                return Kernel<int8_t, true>::name;
+            }
+            else
+            {
+                return Kernel<int8_t, false>::name;
+            }
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            return Kernel<__fp16>::name;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+}
+
+BlockSizes calculate_block_sizes_from_data_type(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K, DataType input_type, bool use_dot)
+{
+    switch(input_type)
+    {
+        case DataType::F32:
+            return calculate_block_sizes_template<float>(ci, M, N, K);
+#ifdef __aarch64__
+        case DataType::U8:
+        case DataType::QASYMM8:
+            if(use_dot)
+            {
+                return calculate_block_sizes_template<uint8_t, true>(ci, M, N, K);
+            }
+            else
+            {
+                return calculate_block_sizes_template<uint8_t, false>(ci, M, N, K);
+            }
+        case DataType::S8:
+            if(use_dot)
+            {
+                return calculate_block_sizes_template<int8_t, true>(ci, M, N, K);
+            }
+            else
+            {
+                return calculate_block_sizes_template<int8_t, false>(ci, M, N, K);
+            }
+#endif /* __aarch64__ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            return calculate_block_sizes_template<__fp16>(ci, M, N, K);
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("DataType not supported");
+            break;
+    }
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
index c9037ab..0fc3610 100644
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
@@ -49,7 +49,7 @@
     p.N       = c->info()->tensor_shape().x();
     p.K       = a->info()->tensor_shape().x();
     p.multis  = b->info()->tensor_shape().z();
-    p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis;
+    p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; // COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
 
     return p;
 }
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
index 715fe70..2c9cd32 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.cpp
@@ -37,7 +37,7 @@
 void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::configure(const ITensor *prepared_a, const ITensor *transformed_b, ITensor *tmp_c, ITensor *c, const Window &block_walker,
                                                                                 const BlockSizes &block_sizes, const INEGEMMWrapperKernel::Params &params, bool b_is_pretransposed, float alpha, float beta, unsigned int max_num_threads)
 {
-    using strategy = typename Kernel<To>::strategy;
+    using strategy = typename Kernel<To, use_dot>::strategy;
 
     _prepared_a         = prepared_a;
     _transformed_b      = transformed_b;
@@ -57,7 +57,7 @@
 void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::transform(const MatrixMultiplyWorkload &wl, const ThreadInfo &info, const Window &batch_window, const Coordinates &start_offset,
                                                                                 const Coordinates &end_offset)
 {
-    using strategy = typename Kernel<To>::strategy;
+    using strategy = typename Kernel<To, use_dot>::strategy;
 
     strategy           strat(info.cpu_info);
     TensorAccessor<To> prepared_a(*_prepared_a);
@@ -98,7 +98,7 @@
 template <typename To, typename Tr, bool use_dot>
 void NEGEMMInterleavedMatrixMultiplyWrapperTemplate<To, Tr, use_dot>::create_workloads(std::vector<MatrixMultiplyWorkload> &workloads)
 {
-    using strategy = typename Kernel<To>::strategy;
+    using strategy = typename Kernel<To, use_dot>::strategy;
 
     unsigned int offset_transformed_b = 0;
     execute_window_loop(_block_walker, [&](const Coordinates & id)
@@ -127,6 +127,7 @@
     });
 }
 
+// TODO: Regroup these explicit template instantiations somewhere?
 template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<float, float>;
 #ifdef __aarch64__
 template class NEGEMMInterleavedMatrixMultiplyWrapperTemplate<uint8_t, uint32_t>;
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
index f33a14f..6c201ce 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.cpp
@@ -89,7 +89,6 @@
     // Calculate the total size of the buffer:
     size_t total = num_full_k * normal_k_size * (num_full_x * normal_x_size + left_over_x_size);
     total += left_over_k_size * (left_over_x_size + num_full_x * normal_x_size);
-    total *= sizeof(To);
     return total;
 }
 
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
index 26a8ade..69842fe 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedStrategies.h
@@ -37,6 +37,10 @@
 #include "../arm_gemm/kernels/a64_gemm_u8_4x4.hpp"
 #include "../arm_gemm/kernels/a64_hgemm_24x8.hpp"
 #include "../arm_gemm/kernels/a64_sgemm_12x8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
+#include "../arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
 
 namespace arm_compute
 {
@@ -47,48 +51,82 @@
 {
 };
 
+#define DEFINE_STRATEGY_SUFFIX(strat, suffix)            \
+    using strategy                    = arm_gemm::strat; \
+    static constexpr const char *name = #strat suffix;
+
+#define DEFINE_STRATEGY(strat) \
+    DEFINE_STRATEGY_SUFFIX(strat, "")
+
+#ifdef __ARM_FEATURE_SVE
+template <>
+struct Kernel<float, false>
+{
+    DEFINE_STRATEGY(interleaved_fp32_mla_3VLx8)
+};
+template <>
+struct Kernel<float16_t, false>
+{
+    DEFINE_STRATEGY(interleaved_fp16_mla_3VLx8)
+};
+template <bool use_dot>
+struct Kernel<int8_t, use_dot>
+{
+    DEFINE_STRATEGY(interleaved_s8s32_dot_3VLx8)
+};
+template <bool use_dot>
+struct Kernel<uint8_t, use_dot>
+{
+    DEFINE_STRATEGY(interleaved_u8u32_dot_3VLx8)
+};
+#else /* __ARM_FEATURE_SVE */
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
 struct Kernel<float16_t, false>
 {
-    using strategy = arm_gemm::hgemm_24x8;
+    DEFINE_STRATEGY(hgemm_24x8)
 };
 #endif /*__ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 #ifdef __aarch64__
 template <>
 struct Kernel<float, false>
 {
-    using strategy = arm_gemm::sgemm_12x8;
+    DEFINE_STRATEGY(sgemm_12x8)
 };
 template <>
 struct Kernel<int8_t, false>
 {
-    using strategy = arm_gemm::gemm_s8_4x4;
+    DEFINE_STRATEGY(gemm_s8_4x4)
 };
 template <>
 struct Kernel<uint8_t, false>
 {
-    using strategy = arm_gemm::gemm_u8_4x4;
+    DEFINE_STRATEGY(gemm_u8_4x4)
 };
 
 //Use different strategies for 8bit dot product:
 template <>
 struct Kernel<int8_t, true>
 {
-    using strategy = arm_gemm::gemm_s8_12x8;
+    DEFINE_STRATEGY_SUFFIX(gemm_s8_12x8, "_dot")
 };
 template <>
 struct Kernel<uint8_t, true>
 {
-    using strategy = arm_gemm::gemm_u8_12x8;
+    DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot")
 };
 #else
 template <>
 struct Kernel<float, false>
 {
-    using strategy = arm_gemm::sgemm_8x6;
+    DEFINE_STRATEGY(sgemm_8x6)
 };
 #endif /* __aarch64__ */
+#endif /* __ARM_FEATURE_SVE */
+
+#undef DEFINE_STRATEGY
+#undef DEFINE_STRATEGY_SUFFIX
 
 } // namespace
 } // namespace arm_compute
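The strategies header now picks SVE interleaved kernels when __ARM_FEATURE_SVE is defined and, via DEFINE_STRATEGY/DEFINE_STRATEGY_SUFFIX, pairs each strategy with a printable kernel name. A hedged, hand-expanded view of one specialisation (the arm_gemm type here is a stand-in so the snippet is self-contained; only the strategy and suffix names come from the diff):

namespace arm_gemm { struct gemm_u8_12x8 {}; } // stand-in for the real kernel class

template <typename To, bool use_dot> struct Kernel {};

// Hand-expanded form of DEFINE_STRATEGY_SUFFIX(gemm_u8_12x8, "_dot"):
template <>
struct Kernel<unsigned char, true>
{
    using strategy                    = arm_gemm::gemm_u8_12x8;
    static constexpr const char *name = "gemm_u8_12x8" "_dot"; // adjacent literals concatenate to "gemm_u8_12x8_dot"
};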
diff --git a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
index 1780a18..3b80a1f 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.cpp
@@ -66,6 +66,7 @@
     }
 
     unsigned int last_m = 0;
+    //TODO: Create a new iterate_1D( DimY);
     int  last_y          = -1;
     auto window_iterator = arm_compute::create_window_iterator(batch_window, start_offset, end_offset, [&](const Coordinates & id)
     {
diff --git a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
index fb217f0..e452dfb 100644
--- a/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.cpp
@@ -102,6 +102,7 @@
         const unsigned int multi = id.z();
         const unsigned int ymax  = std::min(y0 + strategy::out_height(), m_end);
 
+        // TODO(COMPMID-1424) : Agree on gemm IO layouts
         strat.kernel(a(0, y0, batch, multi), a.stride(Window::DimY),
                      b(0, 0, multi), b.stride(Window::DimY),
                      c(0, y0, batch, multi), c.stride(Window::DimY),
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index 9b3a60d..ca1de26 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
@@ -43,7 +43,7 @@
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index dba2330..9ce43f9 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index b946e5d..21e8f04 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
@@ -43,7 +43,7 @@
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 2510941..c7113d0 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
@@ -43,7 +43,7 @@
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   // Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
new file mode 100644
index 0000000..33b55df
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+}  // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
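The new fp16 file instantiates one process_tile specialisation per supported padding combination and stores them in the Conv::tilefn_* tables, with tilefn_generic as the runtime-parameterised fallback. A minimal sketch of how such a table could be consulted at run time (the TileFn signature, table sizes and select function are assumptions for illustration, not library API):

#include <cstddef>

using TileFn = void (*)(/* tile arguments elided */);

constexpr std::size_t n_in_pad_bottom  = 6;
constexpr std::size_t n_out_pad_bottom = 4;

TileFn select_bottom_tile_fn(const TileFn table[n_in_pad_bottom][n_out_pad_bottom],
                             TileFn generic_fn,
                             std::size_t in_pad_bottom, std::size_t out_pad_bottom)
{
    // Use the compile-time specialised kernel when the padding falls inside
    // the table; otherwise fall back to the generic, runtime-padded kernel.
    if (in_pad_bottom < n_in_pad_bottom && out_pad_bottom < n_out_pad_bottom)
    {
        return table[in_pad_bottom][out_pad_bottom];
    }
    return generic_fn;
}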
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index 44b93a1..c36c24e 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
@@ -43,7 +43,7 @@
   float* const outptr,
   const int out_row_stride,
   const int out_col_stride,
-  const int, const int, const int, const int, const int, const int
+  const int, const int, const int, const int, const int, const int, const int, const int
 )
 {
   constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
new file mode 100644
index 0000000..8f22a64
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
new file mode 100644
index 0000000..09722d0
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+}  // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 8eb53a6..05315ee 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
 
 namespace depthwise
 {
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
new file mode 100644
index 0000000..cf51550
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+        ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+        ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+        },
+        {
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+                ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+        },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+}  // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
new file mode 100644
index 0000000..dacfb24
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ *          NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP16 to FP16
+template <int OutputTileRows, int OutputTileCols,
+          int KernelRows, int KernelCols,
+          int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
+{
+  typedef DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float16_t, float16_t
+  > DWC;
+
+  template <
+    bool Specialize=false,  // Specialize (or not) the method
+    int InPadTop=0,         // If specialized, top padding
+    int InPadLeft=0,        // If specialized, left padding
+    int InPadBottom=0,      // If specialized, bottom padding
+    int InPadRight=0,       // If specialized, right padding
+    int OutPadBottom=0,     // If specialized, bottom output padding
+    int OutPadRight=0       // If specialized, right output padding
+  >
+  static void process_tile(
+    const int n_channels,
+    const float16_t* const weights,
+    const int weight_row_stride,
+    const int weight_col_stride,
+    const float16_t* const inptr,
+    const int in_row_stride,
+    const int in_col_stride,
+    float16_t* const outptr,
+    const int out_row_stride,
+    const int out_col_stride,
+    const int in_pad_top=0,
+    const int in_pad_left=0,
+    const int in_pad_bottom=0,
+    const int in_pad_right=0,
+    const int out_pad_bottom=0,
+    const int out_pad_right=0,
+    const int input_offset=0,
+    const int weights_offset=0
+  );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+  bool Specialize,
+  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+  int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
+  const int n_channels,
+  const float16_t *__restrict__ const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float16_t *__restrict__ const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float16_t *__restrict__ const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int _in_pad_top,
+  const int _in_pad_left,
+  const int _in_pad_bottom,
+  const int _in_pad_right,
+  const int _out_pad_bottom,
+  const int _out_pad_right,
+  const int _input_offset,
+  const int _weights_offset
+)
+{
+  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+  constexpr auto kernel_rows = DWC::kernel_rows;
+  constexpr auto kernel_cols = DWC::kernel_cols;
+  constexpr auto output_tile_rows = DWC::output_tile_rows;
+  constexpr auto output_tile_cols = DWC::output_tile_cols;
+  constexpr auto stride_rows = DWC::stride_rows;
+  constexpr auto stride_cols = DWC::stride_cols;
+
+  // Extract parameters
+  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+  // Compute valid ranges of the tile
+  const int in_cells_i = inner_tile_rows - in_pad_bottom;
+  const int in_cells_j = inner_tile_cols - in_pad_right;
+  const int out_cells_i = output_tile_rows - out_pad_bottom;
+  const int out_cells_j = output_tile_cols - out_pad_right;
+
+  // Instantiate pointers
+  const float16_t* __restrict__ inptr_base = inptr;
+  const float16_t* __restrict__ wptr_base = weights;
+  float16_t* __restrict__ outptr_base = outptr;
+
+  // Perform the depthwise convolution
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 8; channels_remaining -= 8)
+  {
+    // Load input tile
+    float16x8_t u[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        if (i < in_pad_top || in_cells_i <= i ||
+            j < in_pad_left || in_cells_j <= j)
+        {
+          u[i][j] = vdupq_n_f16(0.0f);
+        }
+        else
+        {
+          u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
+        }
+      }
+    }
+    inptr_base += 8;
+
+    // Load weights tile
+    float16x8_t w[kernel_rows][kernel_cols];
+    for (int i = 0; i < kernel_rows; i++)
+    {
+      const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
+      }
+    }
+    wptr_base += 8;
+
+    // Perform the convolution
+    float16x8_t v[output_tile_rows][output_tile_cols];
+    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    {
+      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      {
+        // Base co-ordinate
+        const int base_i = out_i * stride_rows;
+        const int base_j = out_j * stride_cols;
+
+        // Fill the accumulator
+        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        {
+          const int i = base_i + in_i;
+          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          {
+            const int j = base_j + in_j;
+            if (in_i == 0 && in_j == 0)
+            {
+              // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+              v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
+            }
+            else
+            {
+              // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+              v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+            }
+          }
+        }
+      }
+    }
+
+    // Store the output tile
+    for (int i = 0; i < out_cells_i; i++)
+    {
+      float16_t* const outptr_row = outptr_base + i*out_row_stride;
+      for (int j = 0; j < out_cells_j; j++)
+      {
+        vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
+      }
+    }
+    outptr_base += 8;
+  }
+#endif  // __aarch64__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load input tile
+    float16_t u[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        if (i < in_pad_top || in_cells_i <= i ||
+            j < in_pad_left || in_cells_j <= j)
+        {
+          u[i][j] = static_cast<float16_t>(0);
+        }
+        else
+        {
+          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+        }
+      }
+    }
+    inptr_base++;
+
+    // Load weights tile
+    float16_t w[kernel_rows][kernel_cols];
+    for (int i = 0; i < kernel_rows; i++)
+    {
+      const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[i][j] = *(wptr_row + j*weight_col_stride);
+      }
+    }
+    wptr_base++;
+
+    // Perform the convolution
+    float16_t v[output_tile_rows][output_tile_cols];
+    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    {
+      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      {
+        // Clear the accumulator
+        v[out_i][out_j] = static_cast<float16_t>(0);
+
+        // Base co-ordinate
+        const int base_i = out_i * stride_rows;
+        const int base_j = out_j * stride_cols;
+
+        // Fill the accumulator
+        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        {
+          const int i = base_i + in_i;
+          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+      }
+    }
+
+    // Store the output tile
+    for (int i = 0; i < out_cells_i; i++)
+    {
+      float16_t* const outptr_row = outptr_base + i*out_row_stride;
+      for (int j = 0; j < out_cells_j; j++)
+      {
+        *(outptr_row + j*out_col_stride) = v[i][j];
+      }
+    }
+    outptr_base++;
+  }
+}
+}  // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
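Both impl_fp16_fp16.hpp above and impl_fp32_fp32.hpp below follow the same channel-blocking shape: a vectorised main loop (8 fp16 or 4 fp32 lanes per iteration on AArch64) followed by a scalar tail over the remaining channels. A scalar-only sketch of that loop structure, with placeholder per-block and per-channel work so it builds without NEON intrinsics:

// Placeholders for the per-block / per-channel work (the real code does the
// 3x3 depthwise multiply-accumulate here).
static void apply_block(const float *in, float *out, int lanes)
{
    for (int l = 0; l < lanes; l++) { out[l] = in[l] * 2.0f; }
}
static void apply_scalar(const float *in, float *out) { *out = *in * 2.0f; }

void run_over_channels(const float *in, float *out, int n_channels, int lanes)
{
    int remaining = n_channels;
    for (; remaining >= lanes; remaining -= lanes)
    {
        apply_block(in, out, lanes); // wide path: 8 fp16 (or 4 fp32) lanes in the real kernels
        in  += lanes;
        out += lanes;
    }
    for (; remaining; remaining--)
    {
        apply_scalar(in, out);       // scalar tail, as in the second loop of process_tile
        in++;
        out++;
    }
}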
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
new file mode 100644
index 0000000..840086f
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ *          NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP32 to FP32
+template <int OutputTileRows, int OutputTileCols,
+          int KernelRows, int KernelCols,
+          int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
+{
+  typedef DepthwiseConvolution<
+    OutputTileRows, OutputTileCols,
+    KernelRows, KernelCols,
+    StrideRows, StrideCols,
+    float, float
+  > DWC;
+
+  template <
+    bool Specialize=false,  // Specialize (or not) the method
+    int InPadTop=0,         // If specialized, top padding
+    int InPadLeft=0,        // If specialized, left padding
+    int InPadBottom=0,      // If specialized, bottom padding
+    int InPadRight=0,       // If specialized, right padding
+    int OutPadBottom=0,     // If specialized, bottom output padding
+    int OutPadRight=0       // If specialized, right output padding
+  >
+  static void process_tile(
+    const int n_channels,
+    const float* const weights,
+    const int weight_row_stride,
+    const int weight_col_stride,
+    const float* const inptr,
+    const int in_row_stride,
+    const int in_col_stride,
+    float* const outptr,
+    const int out_row_stride,
+    const int out_col_stride,
+    const int in_pad_top=0,
+    const int in_pad_left=0,
+    const int in_pad_bottom=0,
+    const int in_pad_right=0,
+    const int out_pad_bottom=0,
+    const int out_pad_right=0,
+    const int input_offset=0,
+    const int weights_offset=0
+  );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+  bool Specialize,
+  int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+  int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
+  const int n_channels,
+  const float *__restrict__ const weights,
+  const int weight_row_stride,
+  const int weight_col_stride,
+  const float *__restrict__ const inptr,
+  const int in_row_stride,
+  const int in_col_stride,
+  float *__restrict__ const outptr,
+  const int out_row_stride,
+  const int out_col_stride,
+  const int _in_pad_top,
+  const int _in_pad_left,
+  const int _in_pad_bottom,
+  const int _in_pad_right,
+  const int _out_pad_bottom,
+  const int _out_pad_right,
+  const int _input_offset,
+  const int _weights_offset
+)
+{
+  constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+  constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+  constexpr auto kernel_rows = DWC::kernel_rows;
+  constexpr auto kernel_cols = DWC::kernel_cols;
+  constexpr auto output_tile_rows = DWC::output_tile_rows;
+  constexpr auto output_tile_cols = DWC::output_tile_cols;
+  constexpr auto stride_rows = DWC::stride_rows;
+  constexpr auto stride_cols = DWC::stride_cols;
+
+  // Extract parameters
+  const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+  const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+  const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+  const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+  const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+  const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+  // Compute valid ranges of the tile
+  const int in_cells_i = inner_tile_rows - in_pad_bottom;
+  const int in_cells_j = inner_tile_cols - in_pad_right;
+  const int out_cells_i = output_tile_rows - out_pad_bottom;
+  const int out_cells_j = output_tile_cols - out_pad_right;
+
+  // Instantiate pointers
+  const float* __restrict__ inptr_base = inptr;
+  const float* __restrict__ wptr_base = weights;
+  float* __restrict__ outptr_base = outptr;
+
+  // Perform the depthwise convolution
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Load input tile
+    float32x4_t u[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        if (i < in_pad_top || in_cells_i <= i ||
+            j < in_pad_left || in_cells_j <= j)
+        {
+          u[i][j] = vdupq_n_f32(0.0f);
+        }
+        else
+        {
+          u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
+        }
+      }
+    }
+    inptr_base += 4;
+
+    // Load weights tile
+    float32x4_t w[kernel_rows][kernel_cols];
+    for (int i = 0; i < kernel_rows; i++)
+    {
+      const float* const wptr_row = wptr_base + i*weight_row_stride;
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+      }
+    }
+    wptr_base += 4;
+
+    // Perform the convolution
+    float32x4_t v[output_tile_rows][output_tile_cols];
+    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    {
+      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      {
+        // Base co-ordinate
+        const int base_i = out_i * stride_rows;
+        const int base_j = out_j * stride_cols;
+
+        // Fill the accumulator
+        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        {
+          const int i = base_i + in_i;
+          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          {
+            const int j = base_j + in_j;
+            if (in_i == 0 && in_j == 0)
+            {
+              // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+              v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
+            }
+            else
+            {
+              // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+              v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+            }
+          }
+        }
+      }
+    }
+
+    // Store the output tile
+    for (int i = 0; i < out_cells_i; i++)
+    {
+      float* const outptr_row = outptr_base + i*out_row_stride;
+      for (int j = 0; j < out_cells_j; j++)
+      {
+        vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
+      }
+    }
+    outptr_base += 4;
+  }
+#endif  // __aarch64__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load input tile
+    float u[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        if (i < in_pad_top || in_cells_i <= i ||
+            j < in_pad_left || in_cells_j <= j)
+        {
+          u[i][j] = static_cast<float>(0);
+        }
+        else
+        {
+          u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+        }
+      }
+    }
+    inptr_base++;
+
+    // Load weights tile
+    float w[kernel_rows][kernel_cols];
+    for (int i = 0; i < kernel_rows; i++)
+    {
+      const float* const wptr_row = wptr_base + i*weight_row_stride;
+      for (int j = 0; j < kernel_cols; j++)
+      {
+        w[i][j] = *(wptr_row + j*weight_col_stride);
+      }
+    }
+    wptr_base++;
+
+    // Perform the convolution
+    float v[output_tile_rows][output_tile_cols];
+    for (int out_i = 0; out_i < out_cells_i; out_i++)
+    {
+      for (int out_j = 0; out_j < out_cells_j; out_j++)
+      {
+        // Clear the accumulator
+        v[out_i][out_j] = static_cast<float>(0);
+
+        // Base co-ordinate
+        const int base_i = out_i * stride_rows;
+        const int base_j = out_j * stride_cols;
+
+        // Fill the accumulator
+        for (int in_i = 0; in_i < kernel_rows; in_i++)
+        {
+          const int i = base_i + in_i;
+          for (int in_j = 0; in_j < kernel_cols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+      }
+    }
+
+    // Store the output tile
+    for (int i = 0; i < out_cells_i; i++)
+    {
+      float* const outptr_row = outptr_base + i*out_row_stride;
+      for (int j = 0; j < out_cells_j; j++)
+      {
+        *(outptr_row + j*out_col_stride) = v[i][j];
+      }
+    }
+    outptr_base++;
+  }
+}
+
+}  // namespace depthwise
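The scalar tail loop above is the reference form of the whole kernel: each remaining channel is processed with exactly the same tile arithmetic as the vectorised aarch64 path, which simply handles four channels per iteration in float32x4_t lanes. A minimal standalone sketch of that per-channel accumulation, assuming a single channel, a 3x3 kernel, unit strides and a fully populated 4x4 input tile (names are illustrative, not library API):

// Accumulate v = w (*) u for one channel of a 2x2 output tile.
static void depthwise_tile_scalar(const float u[4][4], const float w[3][3], float v[2][2])
{
  for (int out_i = 0; out_i < 2; out_i++)
  {
    for (int out_j = 0; out_j < 2; out_j++)
    {
      v[out_i][out_j] = 0.0f;
      // Kernel window anchored at (out_i, out_j) because the stride is 1.
      for (int in_i = 0; in_i < 3; in_i++)
      {
        for (int in_j = 0; in_j < 3; in_j++)
        {
          v[out_i][out_j] += w[in_i][in_j] * u[out_i + in_i][out_j + in_j];
        }
      }
    }
  }
}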
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
new file mode 100644
index 0000000..d0d8de5
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ *          NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for U8 to S32
+template <int OutputTileRows, int OutputTileCols,
+        int KernelRows, int KernelCols,
+        int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
+{
+    typedef DepthwiseConvolution<
+            OutputTileRows, OutputTileCols,
+            KernelRows, KernelCols,
+            StrideRows, StrideCols,
+            uint8_t, int32_t
+    > DWC;
+
+    template <
+            bool Specialize=false,  // Specialize (or not) the method
+            int InPadTop=0,         // If specialized, top padding
+            int InPadLeft=0,        // If specialized, left padding
+            int InPadBottom=0,      // If specialized, bottom padding
+            int InPadRight=0,       // If specialized, right padding
+            int OutPadBottom=0,     // If specialized, bottom output padding
+            int OutPadRight=0       // If specialized, right output padding
+    >
+    static void process_tile(
+            const int n_channels,
+            const uint8_t* const weights,
+            const int weight_row_stride,
+            const int weight_col_stride,
+            const uint8_t* const inptr,
+            const int in_row_stride,
+            const int in_col_stride,
+            int32_t* const outptr,
+            const int out_row_stride,
+            const int out_col_stride,
+            const int in_pad_top=0,
+            const int in_pad_left=0,
+            const int in_pad_bottom=0,
+            const int in_pad_right=0,
+            const int out_pad_bottom=0,
+            const int out_pad_right=0,
+            const int input_offset=0,
+            const int weights_offset=0);
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+        bool Specialize,
+        int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+        int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
+        const int n_channels,
+        const uint8_t *__restrict__ const weights,
+        const int weight_row_stride,
+        const int weight_col_stride,
+        const uint8_t *__restrict__ const inptr,
+        const int in_row_stride,
+        const int in_col_stride,
+        int32_t *__restrict__ const outptr,
+        const int out_row_stride,
+        const int out_col_stride,
+        const int _in_pad_top,
+        const int _in_pad_left,
+        const int _in_pad_bottom,
+        const int _in_pad_right,
+        const int _out_pad_bottom,
+        const int _out_pad_right,
+        const int _input_offset,
+        const int _weights_offset
+)
+{
+    constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+    constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+    constexpr auto kernel_rows = DWC::kernel_rows;
+    constexpr auto kernel_cols = DWC::kernel_cols;
+    constexpr auto output_tile_rows = DWC::output_tile_rows;
+    constexpr auto output_tile_cols = DWC::output_tile_cols;
+    constexpr auto stride_rows = DWC::stride_rows;
+    constexpr auto stride_cols = DWC::stride_cols;
+
+    // Extract parameters
+    const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+    const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+    const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+    const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+    const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+    const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+    // Compute valid ranges of the tile
+    const int in_cells_i = inner_tile_rows - in_pad_bottom;
+    const int in_cells_j = inner_tile_cols - in_pad_right;
+    const int out_cells_i = output_tile_rows - out_pad_bottom;
+    const int out_cells_j = output_tile_cols - out_pad_right;
+
+    // Instantiate pointers
+    const uint8_t* __restrict__ inptr_base = inptr;
+    const uint8_t* __restrict__ wptr_base = weights;
+    int32_t* __restrict__ outptr_base = outptr;
+
+    // Perform the depthwise convolution
+    int channels_remaining = n_channels;
+#ifdef __aarch64__
+    const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
+    const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
+    for (; channels_remaining >= 16; channels_remaining -= 16)
+    {
+        // Load input tile
+        int32x4x4_t u[inner_tile_rows][inner_tile_cols];
+        for (int i = 0; i < inner_tile_rows; i++)
+        {
+            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+            for (int j = 0; j < inner_tile_cols; j++)
+            {
+                if (i < in_pad_top || in_cells_i <= i ||
+                    j < in_pad_left || in_cells_j <= j)
+                {
+                    u[i][j].val[0] = vdupq_n_s32(0);
+                    u[i][j].val[1] = vdupq_n_s32(0);
+                    u[i][j].val[2] = vdupq_n_s32(0);
+                    u[i][j].val[3] = vdupq_n_s32(0);
+                }
+                else
+                {
+                    const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
+                    u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
+                    u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
+                    u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
+                    u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
+                }
+            }
+        }
+        inptr_base += 16;
+
+        // Load weights tile
+        int32x4x4_t w[kernel_rows][kernel_cols];
+        for (int i = 0; i < kernel_rows; i++)
+        {
+            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+            for (int j = 0; j < kernel_cols; j++)
+            {
+                const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
+                w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
+                w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
+                w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
+                w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
+            }
+        }
+        wptr_base += 16;
+
+        // Perform the convolution
+        int32x4x4_t v[output_tile_rows][output_tile_cols];
+        for (int out_i = 0; out_i < out_cells_i; out_i++)
+        {
+            for (int out_j = 0; out_j < out_cells_j; out_j++)
+            {
+                // Base co-ordinate
+                const int base_i = out_i * stride_rows;
+                const int base_j = out_j * stride_cols;
+
+                // Fill the accumulator
+                for (int in_i = 0; in_i < kernel_rows; in_i++)
+                {
+                    const int i = base_i + in_i;
+                    for (int in_j = 0; in_j < kernel_cols; in_j++)
+                    {
+                        const int j = base_j + in_j;
+                        if (in_i == 0 && in_j == 0)
+                        {
+                            // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+                            v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
+                            v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
+                            v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
+                            v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
+                        }
+                        else
+                        {
+                            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+                            v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
+                            v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
+                            v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
+                            v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Store the output tile
+        for (int i = 0; i < out_cells_i; i++)
+        {
+            int32_t* const outptr_row = outptr_base + i*out_row_stride;
+            for (int j = 0; j < out_cells_j; j++)
+            {
+                vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
+                vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
+                vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
+                vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
+            }
+        }
+        outptr_base += 16;
+    }
+#endif  // __aarch64__
+    for (; channels_remaining; channels_remaining--)
+    {
+        // Load input tile
+        int32_t u[inner_tile_rows][inner_tile_cols];
+        for (int i = 0; i < inner_tile_rows; i++)
+        {
+            const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+            for (int j = 0; j < inner_tile_cols; j++)
+            {
+                if (i < in_pad_top || in_cells_i <= i ||
+                    j < in_pad_left || in_cells_j <= j)
+                {
+                    u[i][j] = static_cast<int32_t>(0);
+                }
+                else
+                {
+                    u[i][j] = static_cast<int32_t>(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
+                }
+            }
+        }
+        inptr_base++;
+
+        // Load weights tile
+        int32_t w[kernel_rows][kernel_cols];
+        for (int i = 0; i < kernel_rows; i++)
+        {
+            const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+            for (int j = 0; j < kernel_cols; j++)
+            {
+                w[i][j] = static_cast<int32_t>(*(wptr_row + j*weight_col_stride)) + _weights_offset;
+            }
+        }
+        wptr_base++;
+
+        // Perform the convolution
+        int32_t v[output_tile_rows][output_tile_cols];
+        for (int out_i = 0; out_i < out_cells_i; out_i++)
+        {
+            for (int out_j = 0; out_j < out_cells_j; out_j++)
+            {
+                // Clear the accumulator
+                v[out_i][out_j] = static_cast<int32_t>(0);
+
+                // Base co-ordinate
+                const int base_i = out_i * stride_rows;
+                const int base_j = out_j * stride_cols;
+
+                // Fill the accumulator
+                for (int in_i = 0; in_i < kernel_rows; in_i++)
+                {
+                    const int i = base_i + in_i;
+                    for (int in_j = 0; in_j < kernel_cols; in_j++)
+                    {
+                        const int j = base_j + in_j;
+                        v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+                    }
+                }
+            }
+        }
+
+        // Store the output tile
+        for (int i = 0; i < out_cells_i; i++)
+        {
+            int32_t* const outptr_row = outptr_base + i*out_row_stride;
+            for (int j = 0; j < out_cells_j; j++)
+            {
+                *(outptr_row + j*out_col_stride) = v[i][j];
+            }
+        }
+        outptr_base++;
+    }
+}
+
+}  // namespace depthwise
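As above, but for the quantised kernel: every uint8_t input and weight is widened to int32_t and shifted by its offset before the multiply-accumulate, which is what the vaddw_s16/vmovl_u8 sequences do sixteen channels at a time. A scalar sketch of that per-element step, with illustrative names:

#include <cstdint>

// Offset-corrected dot product of len uint8_t inputs and weights, as in the
// scalar tail loop of the u8 -> s32 depthwise kernel above.
static int32_t dot_u8_s32(const uint8_t *u, const uint8_t *w, int len,
                          int32_t input_offset, int32_t weights_offset)
{
  int32_t acc = 0;
  for (int i = 0; i < len; i++)
  {
    const int32_t x = static_cast<int32_t>(u[i]) + input_offset;   // widen and offset the input
    const int32_t k = static_cast<int32_t>(w[i]) + weights_offset; // widen and offset the weight
    acc += x * k;
  }
  return acc;
}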
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
new file mode 100644
index 0000000..e66300d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_1x8_fp32_process_tile(
+  int n_channels,
+  const float* const input_base,
+  const int input_row_stride,
+  const int input_col_stride,
+  float* const matrix_base,
+  const int matrix_stride,
+  const int _pad_top,
+  const int _pad_left,
+  const int _pad_bottom,
+  const int _pad_right
+)
+{
+  (void) input_row_stride;  // No rows over which to stride
+  (void) _pad_top;  // Never any top padding
+  (void) _pad_bottom;  // Never any bottom padding
+
+  // Extract padding arguments
+  const int pad_left = Specialized ? PadLeft : _pad_left;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  constexpr int inner_tile_cols = 8;
+  const int cells_j = inner_tile_cols - pad_right;
+
+  float *outptr = matrix_base;
+
+  // Get pointers into the input tile
+  const float *x_ptrs[inner_tile_cols];
+  for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+  {
+    x_ptrs[j] = input_base + xj*input_col_stride;
+  }
+
+  // Vectors used/computed in this kernel.
+  float x[inner_tile_cols];
+  float U[inner_tile_cols];
+
+  for (int j = 0; j < inner_tile_cols; j++)
+  {
+    x[j] = 0.0f;
+  }
+
+  // Perform the Winograd input transformation for each channel in the input
+  // tensor.
+  int channels_remaining = n_channels;
+#ifdef __arm_any__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    float32x4_t x[inner_tile_cols], U[inner_tile_cols];
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vdupq_n_f32(0.0f);
+    }
+
+    // Load x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      x[j] = vld1q_f32(x_ptrs[j]);
+      x_ptrs[j] += 4;
+    }
+
+    // Compute U = x . X
+    U[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+    U[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+    U[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+    U[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+    U[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+    U[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+    U[6] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+    U[7] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      vst1q_f32(outptr + j*matrix_stride, U[j]);
+    }
+    outptr += 4;
+  }
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    float32x2_t x[inner_tile_cols], U[inner_tile_cols];
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[j] = vdup_n_f32(0.0f);
+    }
+
+    // Load x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      x[j] = vld1_f32(x_ptrs[j]);
+      x_ptrs[j] += 2;
+    }
+
+    // Compute U = x . X
+    U[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 49), x[4], -14), x[0], -36);
+    U[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[2], 36), x[3], 13), x[4], -13), x[1], -36), x[5], -1);
+    U[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[5], 1), x[2], 36), x[1], 36), x[4], -13), x[3], -13);
+    U[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 20), x[2], 9), x[5], -2), x[4], -10), x[1], -18);
+    U[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 18), x[2], 9), x[5], 2), x[4], -10), x[3], -20);
+    U[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[3], 15), x[2], 4), x[5], -3), x[4], -5), x[1], -12);
+    U[6] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[6], 1), x[1], 12), x[2], 4), x[5], 3), x[4], -5), x[3], -15);
+    U[7] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(x[7], 1), x[3], 49), x[5], -14), x[1], -36);
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      vst1_f32(outptr + j*matrix_stride, U[j]);
+    }
+    outptr += 2;
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      x[j] = *(x_ptrs[j]++);
+    }
+
+    // Compute U = x . X
+    U[0] = x[0]*-36 + x[4]*-14 + x[2]*49 + x[6]*1;
+    U[1] = x[5]*-1 + x[1]*-36 + x[4]*-13 + x[3]*13 + x[2]*36 + x[6]*1;
+    U[2] = x[3]*-13 + x[4]*-13 + x[1]*36 + x[2]*36 + x[5]*1 + x[6]*1;
+    U[3] = x[1]*-18 + x[4]*-10 + x[5]*-2 + x[2]*9 + x[3]*20 + x[6]*1;
+    U[4] = x[3]*-20 + x[4]*-10 + x[5]*2 + x[2]*9 + x[1]*18 + x[6]*1;
+    U[5] = x[1]*-12 + x[4]*-5 + x[5]*-3 + x[2]*4 + x[3]*15 + x[6]*1;
+    U[6] = x[3]*-15 + x[4]*-5 + x[5]*3 + x[2]*4 + x[1]*12 + x[6]*1;
+    U[7] = x[1]*-36 + x[5]*-14 + x[3]*49 + x[7]*1;
+
+    // Store the transformed vector
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      *(outptr + j*matrix_stride) = U[j];
+    }
+    outptr++;
+  }
+}
+
+}  // namespace (anonymous)
+
+namespace winograd
+{
+template <int x>
+using Tiles = InputTransformImplTiles<1, x, 1, 8, float>;
+
+/*****************************************************************************/
+// 1x3 specialisations
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+/*****************************************************************************/
+// 1x5 specialisations
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+/*****************************************************************************/
+// 1x7 specialisations
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>;
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>,
+};
+
+template <>
+const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>,
+  winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>,
+};
+/*****************************************************************************/
+
+
+template class InputTransform<1, 3, 1, 8, float>;
+template class InputTransform<3, 1, 8, 1, float>;
+template class InputTransform<1, 5, 1, 8, float>;
+template class InputTransform<5, 1, 8, 1, float>;
+template class InputTransform<1, 7, 1, 8, float>;
+template class InputTransform<7, 1, 8, 1, float>;
+}  // namespace winograd
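For reference, the transform implemented in this file computes U = B^T x per channel with the fixed coefficients that appear in both the vectorised and scalar paths above. A standalone scalar sketch (illustrative, not library API):

// 1x8 Winograd input transform, one channel.
static void winograd_input_transform_1x8_ref(const float x[8], float U[8])
{
  U[0] = -36*x[0] + 49*x[2] - 14*x[4] + x[6];
  U[1] = -36*x[1] + 36*x[2] + 13*x[3] - 13*x[4] - x[5] + x[6];
  U[2] =  36*x[1] + 36*x[2] - 13*x[3] - 13*x[4] + x[5] + x[6];
  U[3] = -18*x[1] +  9*x[2] + 20*x[3] - 10*x[4] - 2*x[5] + x[6];
  U[4] =  18*x[1] +  9*x[2] - 20*x[3] - 10*x[4] + 2*x[5] + x[6];
  U[5] = -12*x[1] +  4*x[2] + 15*x[3] -  5*x[4] - 3*x[5] + x[6];
  U[6] =  12*x[1] +  4*x[2] - 15*x[3] -  5*x[4] + 3*x[5] + x[6];
  U[7] = -36*x[1] + 49*x[3] - 14*x[5] + x[7];
}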
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
index 6d8afc0..4203945 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp
@@ -29,91 +29,36 @@
 namespace winograd
 {
 
-using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>;
 
-/******************************************************************************
- * Cost methods for the input transform.
- * =====================================
- */
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
+namespace
 {
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
-  const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
-  return 16 * 16 * tile_M * tile_N * input_shape.n_channels;
-}
-/*****************************************************************************/
 
-/*****************************************************************************
-* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an image
-* can require one row or column of padding on their top and left sides if the
-* padding type is SAME (where X represents a padded value):
-*
-*      _______    _______
-*     |X X X X|  |X X X X|
-*     |X      |  |       |   . . .
-*     |X      |  |       |
-*     |X______|  |_______|
-*      _______
-*     |X      |             .
-*     |X      |   . . .       .
-*     |X      |                 .
-*     |X______|
-*
-* For tiles near the right or bottom of the image it is more complicated.  Such
-* tiles might require padding by 0 or 1 rows or columns if the padding type is
-* VALID or 1 or 2 rows or columns if the padding type is SAME:
-*
-*      _______    _______    _______    _______
-*     |X X X X|  |X X X X|  |X X X X|  |X X X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X______|  |_______|  |______X|  |____X_X|
-*      _______    _______    _______    _______
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X______|  |_______|  |______X|  |____X_X|
-*      _______    _______    _______    _______
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X_X_X_X|  |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*      _______    _______    _______    _______
-*     |X      |  |       |  |      X|  |    X X|
-*     |X      |  |       |  |      X|  |    X X|
-*     |X X X X|  |X X X X|  |X X X X|  |X X X X|
-*     |X_X_X_X|  |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-* Additional tiles are required for especially small input images.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-*     Padding top in {0, 1}
-*     Padding left in {0, 1}
-*     Padding bottom in {0, 1, 2}
-*     Padding right in {0, 1, 2}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_4x4_fp32_process_tile(
   int n_channels,
   const float* const input_base,
   const int input_row_stride,
   const int input_col_stride,
   float* const matrix_base,
-  const int matrix_stride
-)
+  const int matrix_stride,
+  const int _pad_top,
+  const int _pad_left,
+  const int _pad_bottom,
+  const int _pad_right
+)
 {
+  const int pad_top = Specialized ? PadTop : _pad_top;
+  const int pad_left = Specialized ? PadLeft : _pad_left;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
   constexpr int inner_tile_i = 4, inner_tile_j = 4;
-  constexpr int cells_i = inner_tile_i - pad_bottom;
-  constexpr int cells_j = inner_tile_i - pad_right;
+  const int cells_i = inner_tile_i - pad_bottom;
+  const int cells_j = inner_tile_j - pad_right;
+
+
 
   float *outptr = matrix_base;
 
@@ -327,83 +272,40 @@
   }
 }
 
+}  // namespace (anonymous)
+
 template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>;
+
 template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
-  {
-    {
-      {
-        Transform::template process_tile<0, 0, 0, 0>,  // No padding
-        Transform::template process_tile<0, 0, 0, 1>,  // Right
-        Transform::template process_tile<0, 0, 0, 2>,  // Right
-      },
-      {
-        Transform::template process_tile<0, 0, 1, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 1, 1>,  // Bottom-right
-        Transform::template process_tile<0, 0, 1, 2>,  // Bottom-right
-      },
-      {
-        Transform::template process_tile<0, 0, 2, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 2, 1>,  // Bottom-right
-        Transform::template process_tile<0, 0, 2, 2>,  // Bottom-right
-      }
-    },
-    {
-      {
-        Transform::template process_tile<0, 1, 0, 0>,  // Left
-        Transform::template process_tile<0, 1, 0, 1>,  // Left AND right
-        Transform::template process_tile<0, 1, 0, 2>,  // Left AND right
-      },
-      {
-        Transform::template process_tile<0, 1, 1, 0>,  // Left-bottom
-        Transform::template process_tile<0, 1, 1, 1>,  // Left, bottom AND right
-        Transform::template process_tile<0, 1, 1, 2>,  // Left, bottom AND right
-      },
-      {
-        Transform::template process_tile<0, 1, 2, 0>,  // Left-bottom
-        Transform::template process_tile<0, 1, 2, 1>,  // Left, bottom AND right
-        Transform::template process_tile<0, 1, 2, 2>,  // Left, bottom AND right
-      }
-    },
-  },
-  {
-    {
-      {
-        Transform::template process_tile<1, 0, 0, 0>,  // Top
-        Transform::template process_tile<1, 0, 0, 1>,  // Top-right
-        Transform::template process_tile<1, 0, 0, 2>,  // Top-right
-      },
-      {
-        Transform::template process_tile<1, 0, 1, 0>,  // Top AND bottom
-        Transform::template process_tile<1, 0, 1, 1>,  // Top, bottom AND right
-        Transform::template process_tile<1, 0, 1, 2>,  // Top, bottom AND right
-      },
-      {
-        Transform::template process_tile<1, 0, 2, 0>,  // Top AND bottom
-        Transform::template process_tile<1, 0, 2, 1>,  // Top, bottom AND right
-        Transform::template process_tile<1, 0, 2, 2>,  // Top, bottom AND right
-      }
-    },
-    {
-      {
-        Transform::template process_tile<1, 1, 0, 0>,  // Top-left
-        Transform::template process_tile<1, 1, 0, 1>,  // Top, left AND right
-        Transform::template process_tile<1, 1, 0, 2>,  // Top, left AND right
-      },
-      {
-        Transform::template process_tile<1, 1, 1, 0>,  // Top, left AND bottom
-        Transform::template process_tile<1, 1, 1, 1>,  // All padded
-        Transform::template process_tile<1, 1, 1, 2>,  // All padded
-      },
-      {
-        Transform::template process_tile<1, 1, 2, 0>,  // Top, left AND bottom
-        Transform::template process_tile<1, 1, 2, 1>,  // All padded
-        Transform::template process_tile<1, 1, 2, 2>,  // All padded
-      }
-    }
-  }
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>;
+
+
+template <>
+const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = {
+  winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>,
 };
 
-template struct WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>,
+};
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>,
+};
+
+template class InputTransform<3, 3, 4, 4, float>;
 }  // namespace winograd
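The shape of this change, here and in the other transform files, is to replace the old multi-dimensional table of compile-time specialisations with one run-time generic function plus a small set of compile-time specialised instantiations selected through the tilefn_* arrays. A minimal sketch of that dispatch pattern, using hypothetical names and a placeholder transform body:

// One templated tile function: padding is a compile-time constant when
// Specialized is true, otherwise it falls back to the run-time argument.
template <bool Specialized, int PadRight = 0>
void process_tile_sketch(int n_channels, const float *in, float *out, int _pad_right)
{
  const int pad_right = Specialized ? PadRight : _pad_right;
  const int cells_j = 4 - pad_right;  // 4x4 inner tile, as for F(2x2, 3x3)
  for (int c = 0; c < n_channels; c++)
  {
    for (int j = 0; j < cells_j; j++)
    {
      out[c*4 + j] = in[c*4 + j];  // placeholder for the real transform
    }
  }
}

// Dispatch tables, mirroring tilefn_right_padded and tilefn_generic.
using TileFnSketch = void (*)(int, const float *, float *, int);
static const TileFnSketch tile_right_padded_sketch[2] = {
  process_tile_sketch<true, 1>,
  process_tile_sketch<true, 2>,
};
static const TileFnSketch tile_generic_sketch = process_tile_sketch<false>;

This keeps one copy of the algorithm while still letting the padded border cases compile down to fixed loop bounds.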
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
deleted file mode 100644
index ebc0c07..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_5x5_fp32.cpp
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Transform = WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
-{
-  (void) input_shape;
-  return 0;
-}
-
-/*****************************************************************************
-* F(2x2, 5x5) implies the use of a 6x6 input tile.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-*     Padding top in {0, 2}
-*     Padding left in {0, 2}
-*     Padding bottom in {0, 1, 2, 3, 4}
-*     Padding right in {0, 1, 2, 3, 4}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
-  int n_channels,
-  const float* const input_base,
-  const int input_row_stride,
-  const int input_col_stride,
-  float* const matrix_base,
-  const int matrix_stride
-)
-{
-  constexpr int cells_i = 6 - pad_bottom;
-  constexpr int cells_j = 6 - pad_right;
-
-  float *outptr = matrix_base;
-
-  // Get pointers into the input tile
-  const float *x_ptrs[6][6];
-  for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
-  {
-    // Get a pointer into the row
-    const float* const row_ptr = input_base + xi*input_row_stride;
-
-    for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
-    {
-      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
-    }
-  }
-
-  // Matrices used/computed in this kernel.
-  float x[6][6], XTx[6][6], U[6][6];
-  for (int i = 0; i < 6; i++)
-  {
-    for (int j = 0; j < 6; j++)
-    {
-      x[i][j] = XTx[i][j] = 0.0f;
-    }
-  }
-
-  // Perform the Winograd input transformation for each channel in the input
-  // tensor.
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    // Matrices used/computed in this kernel
-    float32x4_t x[6][6], XTx[6][6], U[6][6];
-    for (int i = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++)
-      {
-        x[i][j] = vdupq_n_f32(0.0f);
-        XTx[i][j] = vdupq_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1q_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 4;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 4;
-  }
-#endif  // __aarch64__
-#ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used/computed in this kernel
-    float32x2_t x[6][6], XTx[6][6], U[6][6];
-    for (int i = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++)
-      {
-        x[i][j] = vdup_n_f32(0.0f);
-        XTx[i][j] = vdup_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 2;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        vst1_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 2;
-  }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = *(x_ptrs[i][j]++);
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        *(outptr + m*matrix_stride) = U[i][j];
-      }
-    }
-    outptr++;
-  }
-}
-
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
-  {
-    {
-      {
-        Transform::template process_tile<0, 0, 0, 0>,  // No padding
-        Transform::template process_tile<0, 0, 0, 1>,  // Right
-        Transform::template process_tile<0, 0, 0, 2>,  // "   "
-        Transform::template process_tile<0, 0, 0, 3>,  // "   "
-        Transform::template process_tile<0, 0, 0, 4>,  // "   "
-      },
-      {
-        Transform::template process_tile<0, 0, 1, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 1, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 1, 2>,  // "          "
-        Transform::template process_tile<0, 0, 1, 3>,  // "          "
-        Transform::template process_tile<0, 0, 1, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 2, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 2, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 2, 2>,  // "          "
-        Transform::template process_tile<0, 0, 2, 3>,  // "          "
-        Transform::template process_tile<0, 0, 2, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 3, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 3, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 3, 2>,  // "          "
-        Transform::template process_tile<0, 0, 3, 3>,  // "          "
-        Transform::template process_tile<0, 0, 3, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 4, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 4, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 4, 2>,  // "          "
-        Transform::template process_tile<0, 0, 4, 3>,  // "          "
-        Transform::template process_tile<0, 0, 4, 4>,  // "          "
-      }
-    },
-    {
-      {
-        Transform::template process_tile<0, 2, 0, 0>,  // Left
-        Transform::template process_tile<0, 2, 0, 1>,
-        Transform::template process_tile<0, 2, 0, 2>,
-        Transform::template process_tile<0, 2, 0, 3>,
-        Transform::template process_tile<0, 2, 0, 4>,
-      },
-      {
-        Transform::template process_tile<0, 2, 1, 0>,  // Bottom left
-        Transform::template process_tile<0, 2, 1, 1>,
-        Transform::template process_tile<0, 2, 1, 2>,
-        Transform::template process_tile<0, 2, 1, 3>,
-        Transform::template process_tile<0, 2, 1, 4>,
-      },
-      {
-        Transform::template process_tile<0, 2, 2, 0>,  // "          "
-        Transform::template process_tile<0, 2, 2, 1>,
-        Transform::template process_tile<0, 2, 2, 2>,
-        Transform::template process_tile<0, 2, 2, 3>,
-        Transform::template process_tile<0, 2, 2, 4>,
-      },
-      {
-        Transform::template process_tile<0, 2, 3, 0>,  // "          "
-        Transform::template process_tile<0, 2, 3, 1>,
-        Transform::template process_tile<0, 2, 3, 2>,
-        Transform::template process_tile<0, 2, 3, 3>,
-        Transform::template process_tile<0, 2, 3, 4>,
-      },
-      {
-        Transform::template process_tile<0, 2, 4, 0>,  // "          "
-        Transform::template process_tile<0, 2, 4, 1>,
-        Transform::template process_tile<0, 2, 4, 2>,
-        Transform::template process_tile<0, 2, 4, 3>,
-        Transform::template process_tile<0, 2, 4, 4>,
-      }
-    }
-  },
-  {
-    {
-      {
-        Transform::template process_tile<2, 0, 0, 0>,  // Top
-        Transform::template process_tile<2, 0, 0, 1>,  // Top right
-        Transform::template process_tile<2, 0, 0, 2>,  // "       "
-        Transform::template process_tile<2, 0, 0, 3>,  // "       "
-        Transform::template process_tile<2, 0, 0, 4>,  // "       "
-      },
-      {
-        Transform::template process_tile<2, 0, 1, 0>,
-        Transform::template process_tile<2, 0, 1, 1>,
-        Transform::template process_tile<2, 0, 1, 2>,
-        Transform::template process_tile<2, 0, 1, 3>,
-        Transform::template process_tile<2, 0, 1, 4>,
-      },
-      {
-        Transform::template process_tile<2, 0, 2, 0>,
-        Transform::template process_tile<2, 0, 2, 1>,
-        Transform::template process_tile<2, 0, 2, 2>,
-        Transform::template process_tile<2, 0, 2, 3>,
-        Transform::template process_tile<2, 0, 2, 4>,
-      },
-      {
-        Transform::template process_tile<2, 0, 3, 0>,
-        Transform::template process_tile<2, 0, 3, 1>,
-        Transform::template process_tile<2, 0, 3, 2>,
-        Transform::template process_tile<2, 0, 3, 3>,
-        Transform::template process_tile<2, 0, 3, 4>,
-      },
-      {
-        Transform::template process_tile<2, 0, 4, 0>,
-        Transform::template process_tile<2, 0, 4, 1>,
-        Transform::template process_tile<2, 0, 4, 2>,
-        Transform::template process_tile<2, 0, 4, 3>,
-        Transform::template process_tile<2, 0, 4, 4>,
-      },
-    },
-    {
-      {
-        Transform::template process_tile<2, 2, 0, 0>,  // Top left
-        Transform::template process_tile<2, 2, 0, 1>,
-        Transform::template process_tile<2, 2, 0, 2>,
-        Transform::template process_tile<2, 2, 0, 3>,
-        Transform::template process_tile<2, 2, 0, 4>,
-      },
-      {
-        Transform::template process_tile<2, 2, 1, 0>,
-        Transform::template process_tile<2, 2, 1, 1>,
-        Transform::template process_tile<2, 2, 1, 2>,
-        Transform::template process_tile<2, 2, 1, 3>,
-        Transform::template process_tile<2, 2, 1, 4>,
-      },
-      {
-        Transform::template process_tile<2, 2, 2, 0>,
-        Transform::template process_tile<2, 2, 2, 1>,
-        Transform::template process_tile<2, 2, 2, 2>,
-        Transform::template process_tile<2, 2, 2, 3>,
-        Transform::template process_tile<2, 2, 2, 4>,
-      },
-      {
-        Transform::template process_tile<2, 2, 3, 0>,
-        Transform::template process_tile<2, 2, 3, 1>,
-        Transform::template process_tile<2, 2, 3, 2>,
-        Transform::template process_tile<2, 2, 3, 3>,
-        Transform::template process_tile<2, 2, 3, 4>,
-      },
-      {
-        Transform::template process_tile<2, 2, 4, 0>,
-        Transform::template process_tile<2, 2, 4, 1>,
-        Transform::template process_tile<2, 2, 4, 2>,
-        Transform::template process_tile<2, 2, 4, 3>,
-        Transform::template process_tile<2, 2, 4, 4>,
-      }
-    }
-  }
-};
-
-template struct WinogradGEMM<2, 2, 5, 5>::InputTransform<float>;
-}  // namespace winograd
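The file removed here (and the F(4x4, 3x3) file below, which uses the same 6x6 inner tile) applied the transform U = B^T x B per channel; the coefficients are visible in the scalar tail loop above. A standalone scalar reference of the two passes (illustrative only):

// 6x6 Winograd input transform, one channel: XTx = B^T x, then U = XTx B.
static void winograd_input_transform_6x6_ref(const float x[6][6], float U[6][6])
{
  float XTx[6][6];
  for (int j = 0; j < 6; j++)
  {
    XTx[0][j] =  4*x[0][j] - 5*x[2][j] + x[4][j];
    XTx[1][j] = -4*x[1][j] - 4*x[2][j] + x[3][j] + x[4][j];
    XTx[2][j] =  4*x[1][j] - 4*x[2][j] - x[3][j] + x[4][j];
    XTx[3][j] = -2*x[1][j] -   x[2][j] + 2*x[3][j] + x[4][j];
    XTx[4][j] =  2*x[1][j] -   x[2][j] - 2*x[3][j] + x[4][j];
    XTx[5][j] =  4*x[1][j] - 5*x[3][j] + x[5][j];
  }
  for (int i = 0; i < 6; i++)
  {
    U[i][0] =  4*XTx[i][0] - 5*XTx[i][2] + XTx[i][4];
    U[i][1] = -4*XTx[i][1] - 4*XTx[i][2] + XTx[i][3] + XTx[i][4];
    U[i][2] =  4*XTx[i][1] - 4*XTx[i][2] - XTx[i][3] + XTx[i][4];
    U[i][3] = -2*XTx[i][1] -   XTx[i][2] + 2*XTx[i][3] + XTx[i][4];
    U[i][4] =  2*XTx[i][1] -   XTx[i][2] - 2*XTx[i][3] + XTx[i][4];
    U[i][5] =  4*XTx[i][1] - 5*XTx[i][3] + XTx[i][5];
  }
}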
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
deleted file mode 100644
index 04d1573..0000000
--- a/src/core/NEON/kernels/convolution/winograd/transforms/input_4x4_3x3_fp32.cpp
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
-
-namespace winograd
-{
-
-using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &input_shape)
-{
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
-  const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
-  return 12 * 24 * tile_M * tile_N * input_shape.n_channels;
-}
-
-/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a
-* variety of padding types. For example, tiles at the top and left of an
-* image can require one row or column of padding on their top and left sides
-* if the padding type is SAME (where X represents a padded value):
-*
-*      ___________    ___________
-*     |X X X X X X|  |X X X X X X|
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X          |  |           |
-*     |X__________|  |___________|
-*      ___________
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X          |
-*     |X__________|
-*
-* For tiles near the right or bottom of the image it is more complicated.
-* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the
-* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding
-* type is SAME.
-*
-* Build an array of the specialised methods that deal with each of the
-* different padding combinations which may be required. These padding
-* constraints are the space:
-*
-*     Padding top in {0, 1}
-*     Padding left in {0, 1}
-*     Padding bottom in {0, 1, 2, 3, 4}
-*     Padding right in {0, 1, 2, 3, 4}
-*/
-template <>
-template <>
-template <int pad_top, int pad_left, int pad_bottom, int pad_right>
-void Transform::process_tile(
-  int n_channels,
-  const float* const input_base,
-  const int input_row_stride,
-  const int input_col_stride,
-  float* const matrix_base,
-  const int matrix_stride
-)
-{
-  constexpr int cells_i = 6 - pad_bottom;
-  constexpr int cells_j = 6 - pad_right;
-
-  float *outptr = matrix_base;
-
-  // Get pointers into the input tile
-  const float *x_ptrs[6][6];
-  for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
-  {
-    // Get a pointer into the row
-    const float* const row_ptr = input_base + xi*input_row_stride;
-
-    for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
-    {
-      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
-    }
-  }
-
-  // Matrices used/computed in this kernel.
-  float x[6][6], XTx[6][6], U[6][6];
-  for (int i = 0; i < 6; i++)
-  {
-    for (int j = 0; j < 6; j++)
-    {
-      x[i][j] = XTx[i][j] = 0.0f;
-    }
-  }
-
-  // Perform the Winograd input transformation for each channel in the input
-  // tensor.
-  int channels_remaining = n_channels;
-#ifdef __aarch64__
-  for (; channels_remaining >= 4; channels_remaining -= 4)
-  {
-    // Matrices used/computed in this kernel
-    float32x4_t x[6][6], XTx[6][6], U[6][6];
-    for (int i = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++)
-      {
-        x[i][j] = vdupq_n_f32(0.0f);
-        XTx[i][j] = vdupq_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1q_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 4;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 4;
-  }
-#endif  // __aarch64__
-#ifdef __arm_any__
-  for (; channels_remaining >= 2; channels_remaining -= 2)
-  {
-    // Matrices used/computed in this kernel
-    float32x2_t x[6][6], XTx[6][6], U[6][6];
-    for (int i = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++)
-      {
-        x[i][j] = vdup_n_f32(0.0f);
-        XTx[i][j] = vdup_n_f32(0.0f);
-      }
-    }
-
-    // Read a 6x6 tile in the Winograd domain
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = vld1_f32(x_ptrs[i][j]);
-        x_ptrs[i][j] += 2;
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
-
-      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
-
-      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
-
-      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
-
-      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-      XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
-
-      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
-
-      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
-
-      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
-
-      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-      U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        vst1_f32(outptr + m*matrix_stride, U[i][j]);
-      }
-    }
-    outptr += 2;
-  }
-#endif  // __arm_any__
-  for (; channels_remaining; channels_remaining--)
-  {
-    // Load x
-    for (int i = pad_top; i < cells_i; i++)
-    {
-      for (int j = pad_left; j < cells_j; j++)
-      {
-        x[i][j] = *(x_ptrs[i][j]++);
-      }
-    }
-
-    // Compute XT . x
-    for (int j = pad_left; j < cells_j; j++)
-    {
-      XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
-      XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
-      XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
-      XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
-      XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
-      XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
-    }
-
-    // Compute U = XT . x . X
-    for (int i = 0; i < 6; i++)
-    {
-      U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
-      U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
-      U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
-      U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
-      U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
-      U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
-    }
-
-    // Store the transformed matrix
-    for (int i = 0, m = 0; i < 6; i++)
-    {
-      for (int j = 0; j < 6; j++, m++)
-      {
-        *(outptr + m*matrix_stride) = U[i][j];
-      }
-    }
-    outptr++;
-  }
-}
-
-/* In the below, unusual or especially small tiles are routed via the slow
- * path whereas common or large tiles are routed through a faster path.
- */
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
-{
-  {
-    {
-      {
-        Transform::template process_tile<0, 0, 0, 0>,  // No padding
-        Transform::template process_tile<0, 0, 0, 1>,  // Right
-        Transform::template process_tile<0, 0, 0, 2>,  // "   "
-        Transform::template process_tile<0, 0, 0, 3>,  // "   "
-        Transform::template process_tile<0, 0, 0, 4>,  // "   "
-      },
-      {
-        Transform::template process_tile<0, 0, 1, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 1, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 1, 2>,  // "          "
-        Transform::template process_tile<0, 0, 1, 3>,  // "          "
-        Transform::template process_tile<0, 0, 1, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 2, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 2, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 2, 2>,  // "          "
-        Transform::template process_tile<0, 0, 2, 3>,  // "          "
-        Transform::template process_tile<0, 0, 2, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 3, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 3, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 3, 2>,  // "          "
-        Transform::template process_tile<0, 0, 3, 3>,  // "          "
-        Transform::template process_tile<0, 0, 3, 4>,  // "          "
-      },
-      {
-        Transform::template process_tile<0, 0, 4, 0>,  // Bottom
-        Transform::template process_tile<0, 0, 4, 1>,  // Bottom right
-        Transform::template process_tile<0, 0, 4, 2>,  // "          "
-        Transform::template process_tile<0, 0, 4, 3>,  // "          "
-        Transform::template process_tile<0, 0, 4, 4>,  // "          "
-      }
-    },
-    {
-      {
-        Transform::template process_tile<0, 1, 0, 0>,  // Left
-        Transform::template process_tile<0, 1, 0, 1>,
-        Transform::template process_tile<0, 1, 0, 2>,
-        Transform::template process_tile<0, 1, 0, 3>,
-        Transform::template process_tile<0, 1, 0, 4>,
-      },
-      {
-        Transform::template process_tile<0, 1, 1, 0>,  // Bottom left
-        Transform::template process_tile<0, 1, 1, 1>,
-        Transform::template process_tile<0, 1, 1, 2>,
-        Transform::template process_tile<0, 1, 1, 3>,
-        Transform::template process_tile<0, 1, 1, 4>,
-      },
-      {
-        Transform::template process_tile<0, 1, 2, 0>,  // "          "
-        Transform::template process_tile<0, 1, 2, 1>,
-        Transform::template process_tile<0, 1, 2, 2>,
-        Transform::template process_tile<0, 1, 2, 3>,
-        Transform::template process_tile<0, 1, 2, 4>,
-      },
-      {
-        Transform::template process_tile<0, 1, 3, 0>,  // "          "
-        Transform::template process_tile<0, 1, 3, 1>,
-        Transform::template process_tile<0, 1, 3, 2>,
-        Transform::template process_tile<0, 1, 3, 3>,
-        Transform::template process_tile<0, 1, 3, 4>,
-      },
-      {
-        Transform::template process_tile<0, 1, 4, 0>,  // "          "
-        Transform::template process_tile<0, 1, 4, 1>,
-        Transform::template process_tile<0, 1, 4, 2>,
-        Transform::template process_tile<0, 1, 4, 3>,
-        Transform::template process_tile<0, 1, 4, 4>,
-      }
-    }
-  },
-  {
-    {
-      {
-        Transform::template process_tile<1, 0, 0, 0>,  // Top
-        Transform::template process_tile<1, 0, 0, 1>,  // Top right
-        Transform::template process_tile<1, 0, 0, 2>,  // "       "
-        Transform::template process_tile<1, 0, 0, 3>,  // "       "
-        Transform::template process_tile<1, 0, 0, 4>,  // "       "
-      },
-      {
-        Transform::template process_tile<1, 0, 1, 0>,
-        Transform::template process_tile<1, 0, 1, 1>,
-        Transform::template process_tile<1, 0, 1, 2>,
-        Transform::template process_tile<1, 0, 1, 3>,
-        Transform::template process_tile<1, 0, 1, 4>,
-      },
-      {
-        Transform::template process_tile<1, 0, 2, 0>,
-        Transform::template process_tile<1, 0, 2, 1>,
-        Transform::template process_tile<1, 0, 2, 2>,
-        Transform::template process_tile<1, 0, 2, 3>,
-        Transform::template process_tile<1, 0, 2, 4>,
-      },
-      {
-        Transform::template process_tile<1, 0, 3, 0>,
-        Transform::template process_tile<1, 0, 3, 1>,
-        Transform::template process_tile<1, 0, 3, 2>,
-        Transform::template process_tile<1, 0, 3, 3>,
-        Transform::template process_tile<1, 0, 3, 4>,
-      },
-      {
-        Transform::template process_tile<1, 0, 4, 0>,
-        Transform::template process_tile<1, 0, 4, 1>,
-        Transform::template process_tile<1, 0, 4, 2>,
-        Transform::template process_tile<1, 0, 4, 3>,
-        Transform::template process_tile<1, 0, 4, 4>,
-      },
-    },
-    {
-      {
-        Transform::template process_tile<1, 1, 0, 0>,  // Top left
-        Transform::template process_tile<1, 1, 0, 1>,
-        Transform::template process_tile<1, 1, 0, 2>,
-        Transform::template process_tile<1, 1, 0, 3>,
-        Transform::template process_tile<1, 1, 0, 4>,
-      },
-      {
-        Transform::template process_tile<1, 1, 1, 0>,
-        Transform::template process_tile<1, 1, 1, 1>,
-        Transform::template process_tile<1, 1, 1, 2>,
-        Transform::template process_tile<1, 1, 1, 3>,
-        Transform::template process_tile<1, 1, 1, 4>,
-      },
-      {
-        Transform::template process_tile<1, 1, 2, 0>,
-        Transform::template process_tile<1, 1, 2, 1>,
-        Transform::template process_tile<1, 1, 2, 2>,
-        Transform::template process_tile<1, 1, 2, 3>,
-        Transform::template process_tile<1, 1, 2, 4>,
-      },
-      {
-        Transform::template process_tile<1, 1, 3, 0>,
-        Transform::template process_tile<1, 1, 3, 1>,
-        Transform::template process_tile<1, 1, 3, 2>,
-        Transform::template process_tile<1, 1, 3, 3>,
-        Transform::template process_tile<1, 1, 3, 4>,
-      },
-      {
-        Transform::template process_tile<1, 1, 4, 0>,
-        Transform::template process_tile<1, 1, 4, 1>,
-        Transform::template process_tile<1, 1, 4, 2>,
-        Transform::template process_tile<1, 1, 4, 3>,
-        Transform::template process_tile<1, 1, 4, 4>,
-      }
-    }
-  }
-};
-
-template struct WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
-}  // namespace winograd
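
The removed file above and the new input_6x6_fp32.cpp introduced below carry the same unrolled arithmetic: per channel they compute U = B^T . X . B for a 6x6 inner tile. As a cross-check, here is a minimal scalar reference sketch of that product, using the B^T coefficients that appear literally in the scalar fallback paths; the namespace and helper name are illustrative only and are not part of the library.

namespace reference
{
// B^T for the 6x6 inner tile, matching the scalar fallback:
// XTx = B^T . x, then U = XTx . B.
static const float BT[6][6] = {
  { 4.0f,  0.0f, -5.0f,  0.0f,  1.0f,  0.0f},
  { 0.0f, -4.0f, -4.0f,  1.0f,  1.0f,  0.0f},
  { 0.0f,  4.0f, -4.0f, -1.0f,  1.0f,  0.0f},
  { 0.0f, -2.0f, -1.0f,  2.0f,  1.0f,  0.0f},
  { 0.0f,  2.0f, -1.0f, -2.0f,  1.0f,  0.0f},
  { 0.0f,  4.0f,  0.0f, -5.0f,  0.0f,  1.0f},
};

// U = B^T . X . B for one channel of an already-padded 6x6 input tile.
inline void input_transform_tile(const float X[6][6], float U[6][6])
{
  float BTX[6][6];
  for (int i = 0; i < 6; i++)
  {
    for (int j = 0; j < 6; j++)
    {
      float acc = 0.0f;
      for (int k = 0; k < 6; k++)
      {
        acc += BT[i][k] * X[k][j];
      }
      BTX[i][j] = acc;
    }
  }
  for (int i = 0; i < 6; i++)
  {
    for (int j = 0; j < 6; j++)
    {
      float acc = 0.0f;
      for (int k = 0; k < 6; k++)
      {
        acc += BTX[i][k] * BT[j][k];  // right-multiply by B = (B^T)^T
      }
      U[i][j] = acc;
    }
  }
}
}  // namespace reference

Comparing this plain matrix product against the fused vmla/vmls sequences is a convenient way to validate the hand-optimised NEON paths.
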
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
new file mode 100644
index 0000000..893122c
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0>
+void winograd_input_transform_6x6_fp32_process_tile(
+  int n_channels,
+  const float* const input_base,
+  const int input_row_stride,
+  const int input_col_stride,
+  float* const matrix_base,
+  const int matrix_stride,
+  const int _pad_top,
+  const int _pad_left,
+  const int _pad_bottom,
+  const int _pad_right
+)
+{
+  const int pad_top = Specialized ? PadTop : _pad_top;
+  const int pad_left = Specialized ? PadLeft : _pad_left;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  constexpr int inner_tile_rows = 6;
+  constexpr int inner_tile_cols = 6;
+
+  const int cells_i = inner_tile_rows - pad_bottom;
+  const int cells_j = inner_tile_cols - pad_right;
+
+  float *outptr = matrix_base;
+
+  // Get pointers into the input tile
+  const float *x_ptrs[inner_tile_rows][inner_tile_cols];
+  for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
+  {
+    // Get a pointer into the row
+    const float* const row_ptr = input_base + xi*input_row_stride;
+
+    for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+    {
+      x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+    }
+  }
+
+  // Matrices used/computed in this kernel.
+  float x[inner_tile_rows][inner_tile_cols];
+  float XTx[inner_tile_rows][inner_tile_cols];
+  float U[inner_tile_rows][inner_tile_cols];
+  for (int i = 0; i < inner_tile_rows; i++)
+  {
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      x[i][j] = XTx[i][j] = 0.0f;
+    }
+  }
+
+  // Perform the Winograd input transformation for each channel in the input
+  // tensor.
+  int channels_remaining = n_channels;
+#ifdef __aarch64__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used/computed in this kernel
+    float32x4_t x[inner_tile_rows][inner_tile_cols];
+    float32x4_t XTx[inner_tile_rows][inner_tile_cols];
+    float32x4_t U[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vdupq_n_f32(0.0f);
+        XTx[i][j] = vdupq_n_f32(0.0f);
+      }
+    }
+
+    // Read a 6x6 tile of the spatial-domain input
+    for (int i = pad_top; i < cells_i; i++)
+    {
+      for (int j = pad_left; j < cells_j; j++)
+      {
+        x[i][j] = vld1q_f32(x_ptrs[i][j]);
+        x_ptrs[i][j] += 4;
+      }
+    }
+
+    // Compute XT . x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
+      XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
+      XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
+      XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
+      XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
+
+      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
+      XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
+
+      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
+      XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
+      U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
+      U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
+      U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
+      U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
+      U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
+      U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+      }
+    }
+    outptr += 4;
+  }
+#endif  // __aarch64__
+#ifdef __arm_any__
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used/computed in this kernel
+    float32x2_t x[inner_tile_rows][inner_tile_cols];
+    float32x2_t XTx[inner_tile_rows][inner_tile_cols];
+    float32x2_t U[inner_tile_rows][inner_tile_cols];
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++)
+      {
+        x[i][j] = vdup_n_f32(0.0f);
+        XTx[i][j] = vdup_n_f32(0.0f);
+      }
+    }
+
+    // Read a 6x6 tile of the spatial-domain input
+    for (int i = pad_top; i < cells_i; i++)
+    {
+      for (int j = pad_left; j < cells_j; j++)
+      {
+        x[i][j] = vld1_f32(x_ptrs[i][j]);
+        x_ptrs[i][j] += 2;
+      }
+    }
+
+    // Compute XT . x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      // XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
+      XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+      // XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
+      XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
+      XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+      // XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
+      XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+      // XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
+      XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+      // XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
+      XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      // U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
+      U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+      // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
+      U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
+      U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+      // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
+      U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+      // U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
+      U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+      // U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
+      U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        vst1_f32(outptr + m*matrix_stride, U[i][j]);
+      }
+    }
+    outptr += 2;
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Load x
+    for (int i = pad_top; i < cells_i; i++)
+    {
+      for (int j = pad_left; j < cells_j; j++)
+      {
+        x[i][j] = *(x_ptrs[i][j]++);
+      }
+    }
+
+    // Compute XT . x
+    for (int j = pad_left; j < cells_j; j++)
+    {
+      XTx[0][j] =  4*x[0][j] + -5*x[2][j] +  1*x[4][j];
+      XTx[1][j] = -4*x[1][j] + -4*x[2][j] +  1*x[3][j] +  1*x[4][j];
+      XTx[2][j] =  4*x[1][j] + -4*x[2][j] + -1*x[3][j] +  1*x[4][j];
+      XTx[3][j] = -2*x[1][j] + -1*x[2][j] +  2*x[3][j] +  1*x[4][j];
+      XTx[4][j] =  2*x[1][j] + -1*x[2][j] + -2*x[3][j] +  1*x[4][j];
+      XTx[5][j] =  4*x[1][j] + -5*x[3][j] +  1*x[5][j];
+    }
+
+    // Compute U = XT . x . X
+    for (int i = 0; i < inner_tile_rows; i++)
+    {
+      U[i][0] =  4*XTx[i][0] + -5*XTx[i][2] +  1*XTx[i][4];
+      U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] +  1*XTx[i][3] +  1*XTx[i][4];
+      U[i][2] =  4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] +  1*XTx[i][4];
+      U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] +  2*XTx[i][3] +  1*XTx[i][4];
+      U[i][4] =  2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] +  1*XTx[i][4];
+      U[i][5] =  4*XTx[i][1] + -5*XTx[i][3] +  1*XTx[i][5];
+    }
+
+    // Store the transformed matrix
+    for (int i = 0, m = 0; i < inner_tile_rows; i++)
+    {
+      for (int j = 0; j < inner_tile_cols; j++, m++)
+      {
+        *(outptr + m*matrix_stride) = U[i][j];
+      }
+    }
+    outptr++;
+  }
+}
+}  // namespace (anonymous)
+
+namespace winograd
+{
+template <int k>
+using Tiles = InputTransformImplTiles<k, k, 6, 6, float>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>;
+
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>;
+
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>,
+};
+
+template <>
+const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = {
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>,
+  winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>,
+};
+
+template class InputTransform<3, 3, 6, 6, float>;
+template class InputTransform<5, 5, 6, 6, float>;
+}  // namespace winograd
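
The new transform files replace the old dense tile_fns arrays with a single function template whose Specialized flag decides whether the padding comes from template constants (so the cells_i/cells_j bounds fold at compile time) or from run-time arguments (one generic fallback). Below is a standalone sketch of that dispatch pattern under assumed names; process_tile_sketch, TileFnSketch and the tables are illustrative, not the library's API.

// One body serves both the generic case (padding read at run time) and the
// specialised instantiations (padding baked in as template constants).
template <bool Specialized, int PadBottom = 0, int PadRight = 0>
void process_tile_sketch(int n_channels, const float *in, float *out,
                         int _pad_bottom, int _pad_right)
{
  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
  const int pad_right  = Specialized ? PadRight  : _pad_right;
  const int cells_i = 4 - pad_bottom;  // 4x4 output tile, as in the 4x4_3x3 case
  const int cells_j = 4 - pad_right;

  for (int c = 0; c < n_channels; c++)
  {
    for (int i = 0; i < cells_i; i++)
    {
      for (int j = 0; j < cells_j; j++)
      {
        // A copy stands in for the real transform arithmetic.
        out[(i * 4 + j) * n_channels + c] = in[(i * 4 + j) * n_channels + c];
      }
    }
  }
}

using TileFnSketch = void (*)(int, const float *, float *, int, int);

// Per-edge tables of specialised instantiations mirror tilefn_bottom_padded /
// tilefn_right_padded above; a single <false> instantiation remains as the
// generic fallback for any other padding combination.
const TileFnSketch bottom_padded_sketch[3] = {
  process_tile_sketch<true, 1, 0>,
  process_tile_sketch<true, 2, 0>,
  process_tile_sketch<true, 3, 0>,
};
const TileFnSketch generic_sketch = process_tile_sketch<false>;

Compared with the old two- or four-dimensional tile_fns arrays, this keeps one table per padded edge plus a generic entry, which is why the number of emitted specialisations drops sharply in this release.
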
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
new file mode 100644
index 0000000..ea842a4
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_2_7_fp32_process_tile(
+  const int n_channels,
+  const float* const matrix_base,
+  const int matrix_stride,
+  const float* const biases,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
+)
+{
+  (void) output_row_stride;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 2;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
+
+
+  // Construct a map to the output cells
+  float *outptrs[output_tile_cols];  // constexpr bound, avoids a variable-length array
+  for (int j = 0; j < cells_j; j++)
+  {
+    outptrs[j] = output + j*output_col_stride;
+  }
+  const float *inptr = matrix_base;
+  const float *bptr = biases;
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __arm_any__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform
+    float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1q_f32(inptr + j*matrix_stride);
+    }
+    inptr += 4;
+
+    f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1q_f32(bptr);
+      bptr += 4;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1q_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 4;
+    }
+  }
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1_f32(inptr + j*matrix_stride);
+    }
+    inptr += 2;
+
+    f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 2;
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = *(inptr + j*matrix_stride);
+    }
+    inptr++;
+
+    f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+    f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1 + F[7]*1;
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = *(bptr++);
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      *(outptrs[j]++) = f[j] + b;
+    }
+  }
+}
+}  // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2_7_fp32_process_tile<true, 1>
+};
+
+template class OutputTransform<1, 7, 1, 8, float>;
+template class OutputTransform<7, 1, 8, 1, float>;
+}  // namespace winograd
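
The 1-D output transforms added in this release (output_2_7, output_4_5, output_6_3) all read an 8-element Winograd-domain row F and produce the output row as f = A^T . F plus an optional bias. The scalar fallback above is exactly that product for F(2, 7); the reference sketch below makes the matrix explicit (row 1 lists the evaluation points {0, -1, 1, -2, 2, -3, 3} with a final column for the point at infinity). The 4_5 and 6_3 variants extend the same matrix with higher powers of these points. Names here are hypothetical helpers, not library code.

// A^T for F(2, 7), recovered from the scalar fallback in the file above.
static const float AT_2_7[2][8] = {
  {1.0f,  1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 1.0f, 0.0f},
  {0.0f, -1.0f, 1.0f, -2.0f, 2.0f, -3.0f, 3.0f, 1.0f},
};

// f = A^T . F + bias for one channel of one 1x8 Winograd-domain tile.
inline void output_transform_2_7_reference(const float F[8], float bias, float f[2])
{
  for (int i = 0; i < 2; i++)
  {
    float acc = bias;
    for (int j = 0; j < 8; j++)
    {
      acc += AT_2_7[i][j] * F[j];
    }
    f[i] = acc;
  }
}
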
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
index 3b3cda0..597b074 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -23,59 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(shape.n_rows, 2);
-  const int tile_N = iceildiv(shape.n_cols, 2);
-  return 24 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- *      ___     ___
- *     |   |   |  X|
- *     |___|   |__X|
- *
- *      ___     ___
- *     |   |   |  X|
- *     |X_X|   |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_3x3_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 2 - pad_bottom;
-  constexpr int cells_j = 2 - pad_right;
+  constexpr int OutputTileRows = 2, OutputTileCols = 2;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  const int cells_i = OutputTileRows - pad_bottom;
+  const int cells_j = OutputTileCols - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[OutputTileRows][OutputTileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -373,19 +348,28 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,  // No padding
-    Transform::template process_tile<0, 1>,  // Right padding
-  },
-  {
-    Transform::template process_tile<1, 0>,  // Bottom padding
-    Transform::template process_tile<1, 1>,  // Bottom and right padding
-  }
+using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 1, 0>
 };
 
-template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2x2_3x3_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<3, 3, 4, 4, float>;
 }  // namespace winograd
+
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
index 8668535..60d7181 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp
@@ -23,57 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  (void) shape;
-  return 0;
-}
-
-/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain 0 or 1
- * padded values to the right and bottom columns or rows of the tile, e.g.:
- *
- *      ___     ___
- *     |   |   |  X|
- *     |___|   |__X|
- *
- *      ___     ___
- *     |   |   |  X|
- *     |X_X|   |X_X|
- *
- *
- * We provide a specialised output transform for each of these instances.
- * Consequently we below construct an array of the various padding options, the
- * array contains pointers to the specific implementations.
- */
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_2x2_5x5_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 2 - pad_bottom;
-  constexpr int cells_j = 2 - pad_right;
+  constexpr int OutputTileRows = 2, OutputTileCols = 2;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+
+  const int cells_i = OutputTileRows - pad_bottom;
+  const int cells_j = OutputTileCols - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[OutputTileRows][OutputTileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -365,19 +342,28 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,  // No padding
-    Transform::template process_tile<0, 1>,  // Right padding
-  },
-  {
-    Transform::template process_tile<1, 0>,  // Bottom padding
-    Transform::template process_tile<1, 1>,  // Bottom and right padding
-  }
+using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 1, 0>
 };
 
-template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_2x2_5x5_fp32_process_tile<true, 0, 1>
+};
+
+template class OutputTransform<5, 5, 6, 6, float>;
 }  // namespace winograd
+
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
new file mode 100644
index 0000000..911759b
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_4_5_fp32_process_tile(
+  const int n_channels,
+  const float* const matrix_base,
+  const int matrix_stride,
+  const float* const biases,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
+)
+{
+  (void) output_row_stride;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 4;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
+
+  // Construct a map to the output cells
+  float *outptrs[output_tile_cols];  // constexpr bound, avoids a variable-length array
+  for (int j = 0; j < cells_j; j++)
+  {
+    outptrs[j] = output + j*output_col_stride;
+  }
+  const float *inptr = matrix_base;
+  const float *bptr = biases;
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __arm_any__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform
+    float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1q_f32(inptr + j*matrix_stride);
+    }
+    inptr += 4;
+
+    f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+    f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+    f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1q_f32(bptr);
+      bptr += 4;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1q_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 4;
+    }
+  }
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1_f32(inptr + j*matrix_stride);
+    }
+    inptr += 2;
+
+    f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+    f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+    f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 2;
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = *(inptr + j*matrix_stride);
+    }
+    inptr++;
+
+    f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+    f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+    f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+    f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1 + F[7]*1;
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = *(bptr++);
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      *(outptrs[j]++) = f[j] + b;
+    }
+  }
+}
+
+}  // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_4_5_fp32_process_tile<true, 1>,
+  winograd_output_transform_4_5_fp32_process_tile<true, 2>,
+  winograd_output_transform_4_5_fp32_process_tile<true, 3>
+};
+
+template class OutputTransform<1, 5, 1, 8, float>;
+template class OutputTransform<5, 1, 8, 1, float>;
+}  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
index cd3bdef..15cc04b 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -23,73 +23,34 @@
  */
 
 #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
 
-namespace winograd
+namespace
 {
 
-using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
-
-template <>
-template <>
-int Transform::ops_performed(const Tensor4DShape &shape)
-{
-  // NOTE: Cost in FLOPs rather than instructions or uops.
-  const int tile_M = iceildiv(shape.n_rows, 4);
-  const int tile_N = iceildiv(shape.n_cols, 4);
-  return 170 * tile_M * tile_N * shape.n_channels;
-}
-
-/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
- * enough tiles to cover the output space each output tile may contain up to 3
- * padded values to the right and bottom columns or rows of the tile, e.g.:
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |_______|   |______X|  |____X_X|  |__X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*      ________    ________   ________   ________
-*     |       |   |      X|  |    X X|  |  X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X X X X|   |X X X X|  |X X X X|  |X X X X|
-*     |X_X_X_X|   |X_X_X_X|  |X_X_X_X|  |X_X_X_X|
-*
-*
-* We provide a specialised output transform for each of these instances.
-*/
-template <>
-template <>
-template <int pad_bottom, int pad_right>
-void Transform::process_tile(
+template <bool Specialized, int PadBottom=0, int PadRight=0>
+void winograd_output_transform_4x4_3x3_fp32_process_tile(
   const int n_channels,
   const float* const matrix_base,
   const int matrix_stride,
   const float* const biases,
   float* const output,
   const int output_row_stride,
-  const int output_col_stride
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
 )
 {
-  constexpr int cells_i = 4 - pad_bottom;
-  constexpr int cells_j = 4 - pad_right;
+  const int pad_bottom = Specialized ? PadBottom : _pad_bottom;
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  constexpr int TileRows = 4, TileCols = 4;
+
+  const int cells_i = TileRows - pad_bottom;
+  const int cells_j = TileCols - pad_right;
 
   // Construct a map to the output cells
-  float *outptrs[cells_i][cells_j];
+  float *outptrs[TileRows][TileCols];
   for (int i = 0; i < cells_i; i++)
   {
     for (int j = 0; j < cells_j; j++)
@@ -437,35 +398,31 @@
   }
 }
 
-template <>
-template <>
-const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+}  // namespace (anonymous)
+
+namespace winograd
 {
-  {
-    Transform::template process_tile<0, 0>,
-    Transform::template process_tile<0, 1>,
-    Transform::template process_tile<0, 2>,
-    Transform::template process_tile<0, 3>,
-  },
-  {
-    Transform::template process_tile<1, 0>,
-    Transform::template process_tile<1, 1>,
-    Transform::template process_tile<1, 2>,
-    Transform::template process_tile<1, 3>,
-  },
-  {
-    Transform::template process_tile<2, 0>,
-    Transform::template process_tile<2, 1>,
-    Transform::template process_tile<2, 2>,
-    Transform::template process_tile<2, 3>,
-  },
-  {
-    Transform::template process_tile<3, 0>,
-    Transform::template process_tile<3, 1>,
-    Transform::template process_tile<3, 2>,
-    Transform::template process_tile<3, 3>,
-  }
+using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile<false>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = {
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 1, 0>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 2, 0>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 3, 0>,
 };
 
-template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 1>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 2>,
+  winograd_output_transform_4x4_3x3_fp32_process_tile<true, 0, 3>,
+};
+
+template class OutputTransform<3, 3, 6, 6, float>;
 }  // namespace winograd
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
new file mode 100644
index 0000000..58bed71
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+
+namespace
+{
+
+template <bool Specialized, int PadRight=0>
+void winograd_output_transform_6_3_fp32_process_tile(
+  const int n_channels,
+  const float* const matrix_base,
+  const int matrix_stride,
+  const float* const biases,
+  float* const output,
+  const int output_row_stride,
+  const int output_col_stride,
+  const int _pad_bottom,
+  const int _pad_right
+)
+{
+  (void) output_row_stride;
+  (void) _pad_bottom;
+  constexpr int output_tile_cols = 6;
+  constexpr int inner_tile_cols = 8;
+
+  const int pad_right = Specialized ? PadRight : _pad_right;
+  const int cells_j = output_tile_cols - pad_right;
+
+  // Construct a map to the output cells
+  float *outptrs[cells_j];
+  for (int j = 0; j < cells_j; j++)
+  {
+    outptrs[j] = output + j*output_col_stride;
+  }
+  const float *inptr = matrix_base;
+  const float *bptr = biases;
+
+  // For each channel of the output
+  int channels_remaining = n_channels;
+#ifdef __arm_any__
+  for (; channels_remaining >= 4; channels_remaining -= 4)
+  {
+    // Matrices used and computed during this transform
+    float32x4_t F[inner_tile_cols], f[output_tile_cols], b = vdupq_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1q_f32(inptr + j*matrix_stride);
+    }
+    inptr += 4;
+
+    f[0] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+    f[2] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+    f[3] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+    f[4] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+    f[5] = vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmlaq_n_f32(vmulq_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1q_f32(bptr);
+      bptr += 4;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1q_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 4;
+    }
+  }
+  for (; channels_remaining >= 2; channels_remaining -= 2)
+  {
+    // Matrices used and computed during this transform
+    float32x2_t F[inner_tile_cols], f[output_tile_cols], b = vdup_n_f32(0.0f);
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = vld1_f32(inptr + j*matrix_stride);
+    }
+    inptr += 2;
+
+    f[0] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[6], 1), F[5], 1), F[4], 1), F[3], 1), F[2], 1), F[1], 1), F[0], 1);
+    f[1] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 3), F[4], 2), F[3], -2), F[5], -3), F[1], -1);
+    f[2] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 9), F[5], 9), F[4], 4), F[3], 4);
+    f[3] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[6], 27), F[4], 8), F[3], -8), F[5], -27), F[1], -1);
+    f[4] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[2], 1), F[1], 1), F[6], 81), F[5], 81), F[4], 16), F[3], 16);
+    f[5] = vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmla_n_f32(vmul_n_f32(F[7], 1), F[2], 1), F[6], 243), F[4], 32), F[3], -32), F[5], -243), F[1], -1);
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = vld1_f32(bptr);
+      bptr += 2;
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      vst1_f32(outptrs[j], f[j] + b);
+      outptrs[j] += 2;
+    }
+  }
+#endif  // __arm_any__
+  for (; channels_remaining; channels_remaining--)
+  {
+    // Matrices used and computed during this transform
+    float F[inner_tile_cols], f[output_tile_cols], b = 0.0f;
+
+    // Read a 1x8 tile in the Winograd domain
+    for (int j = 0; j < inner_tile_cols; j++)
+    {
+      F[j] = *(inptr + j*matrix_stride);
+    }
+    inptr++;
+
+    f[0] = F[0]*1 + F[1]*1 + F[2]*1 + F[3]*1 + F[4]*1 + F[5]*1 + F[6]*1;
+    f[1] = F[1]*-1 + F[5]*-3 + F[3]*-2 + F[4]*2 + F[6]*3 + F[2]*1;
+    f[2] = F[3]*4 + F[4]*4 + F[5]*9 + F[6]*9 + F[1]*1 + F[2]*1;
+    f[3] = F[1]*-1 + F[5]*-27 + F[3]*-8 + F[4]*8 + F[6]*27 + F[2]*1;
+    f[4] = F[3]*16 + F[4]*16 + F[5]*81 + F[6]*81 + F[1]*1 + F[2]*1;
+    f[5] = F[1]*-1 + F[5]*-243 + F[3]*-32 + F[4]*32 + F[6]*243 + F[2]*1 + F[7]*1;
+
+    // Write out the output tile
+    if (bptr != 0)
+    {
+      b = *(bptr++);
+    }
+    for (int j = 0; j < cells_j; j++)
+    {
+      *(outptrs[j]++) = f[j] + b;
+    }
+  }
+}
+
+}  // namespace (anonymous)
+
+namespace winograd
+{
+using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile<true>;
+
+template <>
+const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = {
+  winograd_output_transform_6_3_fp32_process_tile<true, 1>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 2>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 3>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 4>,
+  winograd_output_transform_6_3_fp32_process_tile<true, 5>,
+};
+
+template class OutputTransform<1, 3, 1, 8, float>;
+template class OutputTransform<3, 1, 8, 1, float>;
+}  // namespace winograd
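
The unrolled NEON and scalar arithmetic above is the 1x8 -> 1x6 output transform for F(6, 3): every coefficient is a power of one of the interpolation points 0, ±1, ±2, ±3, with the trailing tap F[7] folded into the last output only. A loop-form reference, written as a readability sketch for this note (the function name and loop structure are mine; the coefficients are read directly off the scalar path above):

```cpp
#include <array>

// f[j] = sum_i A^T[j][i] * F[i], with A^T[j][i] = points[i]^j.
std::array<float, 6> output_transform_6_3_reference(const std::array<float, 8> &F)
{
    static const float points[7] = {0.f, -1.f, 1.f, -2.f, 2.f, -3.f, 3.f};
    std::array<float, 6> f{};
    for (int j = 0; j < 6; j++)
    {
        float acc = 0.f;
        for (int i = 0; i < 7; i++)
        {
            float coeff = 1.f;
            for (int k = 0; k < j; k++)
            {
                coeff *= points[i];  // points[i]^j, the hard-coded factors in the unrolled code
            }
            acc += coeff * F[i];
        }
        f[j] = acc;
    }
    f[5] += F[7];  // the last output additionally takes the trailing Winograd tap
    return f;
}
```
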
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
new file mode 100644
index 0000000..85cf418
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2_7_fp32.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+namespace winograd
+{
+  template <>
+  template <>
+  void WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Get pointers to each cell of the weight tensor
+    const auto weight_col_stride = n_input_channels * n_output_channels;
+    const float *inptrs[kernel_cols];
+    for (int j = 0; j < kernel_cols; j++)
+    {
+      inptrs[j] = input + j*weight_col_stride;
+    }
+
+    // For each input channel
+    for (int ic = 0; ic < n_input_channels; ic++)
+    {
+      float *outptr = output + ic * matrix_row_stride;
+
+      // For each output channel
+      int channels_remaining = n_output_channels;
+      for (; channels_remaining; channels_remaining--)
+      {
+        // Matrices used and computed in this kernel
+        float w[kernel_cols], V[inner_tile_cols];
+
+        // Read weights
+        for (int j = 0; j < kernel_cols; j++)
+        {
+          w[j] = *(inptrs[j]++);
+        }
+
+        // Compute V = w WT
+        V[0] = (w[0]*-1) / 36.0f;
+        V[1] = (w[1]*-1 + w[3]*-1 + w[5]*-1 + w[0]*1 + w[2]*1 + w[4]*1 + w[6]*1) / 48.0f;
+        V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1 + w[5]*1 + w[6]*1) / 48.0f;
+        V[3] = (w[0]*-1 + w[6]*-64 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8 + w[5]*32) / 120.0f;
+        V[4] = (w[0]*-1 + w[6]*-64 + w[5]*-32 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120.0f;
+        V[5] = (w[5]*-243 + w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[6]*729 + w[0]*1) / 720.0f;
+        V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[5]*243 + w[6]*729 + w[0]*1) / 720.0f;
+        V[7] = (w[6]*1) / 1.0f;
+
+        // Store the transformed weights
+        for (int j = 0; j < inner_tile_cols; j++)
+        {
+          *(outptr + j*matrix_stride) = V[j];
+        }
+        outptr++;
+      }
+    }
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template <>
+  template <>
+  void WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Redirect to the 1xN implementation
+    WinogradGEMM<1, 2, 1, 7>::template WeightsTransform<float>::execute(
+      n_output_channels, n_input_channels, input, output, matrix_stride,
+      matrix_row_stride
+    );
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template struct WinogradGEMM<1, 2, 1, 7>::WeightsTransform<float>;
+  template struct WinogradGEMM<2, 1, 7, 1>::WeightsTransform<float>;
+}
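
Each transformed weight above is the 7-tap kernel evaluated as a polynomial at one interpolation point, scaled by a per-row constant (1/36, 1/48, 1/120, 1/720). A compact restatement, written as a sketch (the helper name and the lambda are mine; the signs and denominators are taken from the expressions above):

```cpp
#include <array>

std::array<float, 8> weight_transform_2_7_reference(const std::array<float, 7> &w)
{
    auto eval = [&](float p)   // sum_k w[k] * p^k
    {
        float acc = 0.f, pk = 1.f;
        for (float wk : w)
        {
            acc += wk * pk;
            pk *= p;
        }
        return acc;
    };

    std::array<float, 8> V{};
    V[0] = -w[0] / 36.0f;           // point  0
    V[1] =  eval(-1.0f) / 48.0f;    // point -1
    V[2] =  eval( 1.0f) / 48.0f;    // point  1
    V[3] = -eval(-2.0f) / 120.0f;   // point -2
    V[4] = -eval( 2.0f) / 120.0f;   // point  2
    V[5] =  eval(-3.0f) / 720.0f;   // point -3
    V[6] =  eval( 3.0f) / 720.0f;   // point  3
    V[7] =  w[6];                   // the "infinity" point keeps only the last tap
    return V;
}
```
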
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
index 76393c1..2f4f6e1 100644
--- a/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_2x2_5x5_fp32.cpp
@@ -401,7 +401,7 @@
   template <>
   int WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
   {
-    return 0;
+    return 0;  // TODO
   }
 
   template class WinogradGEMM<2, 2, 5, 5>::WeightsTransform<float>;
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
new file mode 100644
index 0000000..2f14e20
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_4_5_fp32.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+namespace winograd
+{
+  template <>
+  template <>
+  void WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Get pointers to each cell of the weight tensor
+    const auto weight_col_stride = n_input_channels * n_output_channels;
+    const float *inptrs[kernel_cols];
+    for (int j = 0; j < kernel_cols; j++)
+    {
+      inptrs[j] = input + j*weight_col_stride;
+    }
+
+    // For each input channel
+    for (int ic = 0; ic < n_input_channels; ic++)
+    {
+      float *outptr = output + ic * matrix_row_stride;
+
+      // For each output channel
+      int channels_remaining = n_output_channels;
+      for (; channels_remaining; channels_remaining--)
+      {
+        // Matrices used and computed in this kernel
+        float w[kernel_cols], V[inner_tile_cols];
+
+        // Read weights
+        for (int j = 0; j < kernel_cols; j++)
+        {
+          w[j] = *(inptrs[j]++);
+        }
+
+        // Compute V = w WT
+        V[0] = (w[0]*-1) / 36;
+        V[1] = (w[1]*-1 + w[3]*-1 + w[0]*1 + w[2]*1 + w[4]*1) / 48;
+        V[2] = (w[0]*1 + w[1]*1 + w[2]*1 + w[3]*1 + w[4]*1) / 48;
+        V[3] = (w[0]*-1 + w[4]*-16 + w[2]*-4 + w[1]*2 + w[3]*8) / 120;
+        V[4] = (w[0]*-1 + w[4]*-16 + w[3]*-8 + w[2]*-4 + w[1]*-2) / 120;
+        V[5] = (w[3]*-27 + w[1]*-3 + w[2]*9 + w[4]*81 + w[0]*1) / 720;
+        V[6] = (w[1]*3 + w[2]*9 + w[3]*27 + w[4]*81 + w[0]*1) / 720;
+        V[7] = (w[4]*1) / 1;
+
+        // Store the transformed weights
+        for (int j = 0; j < inner_tile_cols; j++)
+        {
+          *(outptr + j*matrix_stride) = V[j];
+        }
+        outptr++;
+      }
+    }
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template <>
+  template <>
+  void WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Redirect to the 1xN implementation
+    WinogradGEMM<1, 4, 1, 5>::template WeightsTransform<float>::execute(
+      n_output_channels, n_input_channels, input, output, matrix_stride,
+      matrix_row_stride
+    );
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template struct WinogradGEMM<1, 4, 1, 5>::WeightsTransform<float>;
+  template struct WinogradGEMM<4, 1, 5, 1>::WeightsTransform<float>;
+}
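
The F(4, 5) transform above follows the same evaluate-at-a-point pattern with a 5-tap kernel. A tiny sanity check, written for this note rather than taken from the library's tests: a unit impulse in the first tap should reproduce exactly the row scales.

```cpp
#include <cassert>
#include <cmath>

void check_weight_transform_4_5_impulse()
{
    const float w[5] = {1.f, 0.f, 0.f, 0.f, 0.f};
    float V[8];
    // The same expressions as in the execute() above
    V[0] = (w[0] * -1) / 36;
    V[1] = (w[1] * -1 + w[3] * -1 + w[0] * 1 + w[2] * 1 + w[4] * 1) / 48;
    V[2] = (w[0] * 1 + w[1] * 1 + w[2] * 1 + w[3] * 1 + w[4] * 1) / 48;
    V[3] = (w[0] * -1 + w[4] * -16 + w[2] * -4 + w[1] * 2 + w[3] * 8) / 120;
    V[4] = (w[0] * -1 + w[4] * -16 + w[3] * -8 + w[2] * -4 + w[1] * -2) / 120;
    V[5] = (w[3] * -27 + w[1] * -3 + w[2] * 9 + w[4] * 81 + w[0] * 1) / 720;
    V[6] = (w[1] * 3 + w[2] * 9 + w[3] * 27 + w[4] * 81 + w[0] * 1) / 720;
    V[7] = (w[4] * 1) / 1;

    // With w = {1, 0, 0, 0, 0} every term collapses to w[0] times the row sign/scale
    const float expected[8] = {-1.f / 36, 1.f / 48, 1.f / 48, -1.f / 120,
                               -1.f / 120, 1.f / 720, 1.f / 720, 0.f};
    for (int j = 0; j < 8; j++)
    {
        assert(std::fabs(V[j] - expected[j]) < 1e-6f);
    }
}
```
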
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
new file mode 100644
index 0000000..c560aa8
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/transforms/weights_6_3_fp32.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/kernel.hpp"
+
+
+namespace winograd
+{
+  template <>
+  template <>
+  void WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Get pointers to each cell of the weight tensor
+    const auto weight_col_stride = n_input_channels * n_output_channels;
+    const float *inptrs[3];
+    for (int j = 0; j < 3; j++)
+    {
+      inptrs[j] = input + j*weight_col_stride;
+    }
+
+    // For each input channel
+    for (int ic = 0; ic < n_input_channels; ic++)
+    {
+      float *outptr = output + ic * matrix_row_stride;
+
+      // For each output channel
+      int channels_remaining = n_output_channels;
+      for (; channels_remaining; channels_remaining--)
+      {
+        // Matrices used and computed in this kernel
+        float w[3], V[inner_tile_cols];
+
+        // Read weights
+        for (int j = 0; j < 3; j++)
+        {
+          w[j] = *(inptrs[j]++);
+        }
+
+        // Compute V = w WT
+        V[0] = (w[0]*-1) / 36.0f;
+        V[1] = (w[1]*-1 + w[0]*1 + w[2]*1) / 48.0f;
+        V[2] = (w[0]*1 + w[1]*1 + w[2]*1) / 48.0f;
+        V[3] = (w[0]*-1 + w[2]*-4 + w[1]*2) / 120.0f;
+        V[4] = (w[0]*-1 + w[2]*-4 + w[1]*-2) / 120.0f;
+        V[5] = (w[1]*-3 + w[2]*9 + w[0]*1) / 720.0f;
+        V[6] = (w[1]*3 + w[2]*9 + w[0]*1) / 720.0f;
+        V[7] = (w[2]*1) / 1;
+
+        // Store the transformed weights
+        for (int j = 0; j < inner_tile_cols; j++)
+        {
+          *(outptr + j*matrix_stride) = V[j];
+        }
+        outptr++;
+      }
+    }
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template <>
+  template <>
+  void WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::execute(
+    const int n_output_channels,
+    const int n_input_channels,
+    const float* const input,  // NOTE: Data in HWIO order
+    float* const output,
+    const int matrix_stride,
+    const int matrix_row_stride
+  )
+  {
+    // Redirect to the 1xN implementation
+    WinogradGEMM<1, 6, 1, 3>::template WeightsTransform<float>::execute(
+      n_output_channels, n_input_channels, input, output, matrix_stride,
+      matrix_row_stride
+    );
+  }
+
+  template <>
+  template <>
+  int WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+  {
+    (void) shape;
+    return 0;  // TODO
+  }
+
+  template struct WinogradGEMM<1, 6, 1, 3>::WeightsTransform<float>;
+  template struct WinogradGEMM<6, 1, 3, 1>::WeightsTransform<float>;
+}
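
The `<6, 1, 3, 1>` transform above, like the other Nx1 variants in this change, simply forwards to its 1xN counterpart. The likely reason, stated as an assumption rather than from the source: with HWIO weight storage a 1xN kernel's taps are spaced by `I*O` along W, while an Nx1 kernel's taps are spaced by `W*I*O = I*O` along H, so both walk memory identically. Illustrative check (names are mine):

```cpp
// HWIO layout: index = ((h * W + w) * I + i) * O + o
constexpr int stride_between_taps(int kernel_cols, int n_in, int n_out)
{
    return (kernel_cols == 1) ? 1 * n_in * n_out  // Nx1 kernel: step along H (W == 1)
                              : n_in * n_out;     // 1xN kernel: step along W
}

static_assert(stride_between_taps(1, 16, 32) == stride_between_taps(3, 16, 32),
              "column and row kernels walk the weight tensor with the same stride");
```
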
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index a5d4302..a7de2fd 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
@@ -225,4 +225,16 @@
 template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
 template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
 
+template class WinogradGEMM<1, 6, 1, 3>::Convolution<float, float>;
+template class WinogradGEMM<6, 1, 3, 1>::Convolution<float, float>;
+
 template class WinogradGEMM<2, 2, 5, 5>::Convolution<float, float>;
+
+template class WinogradGEMM<1, 4, 1, 5>::Convolution<float, float>;
+template class WinogradGEMM<4, 1, 5, 1>::Convolution<float, float>;
+
+template class WinogradGEMM<1, 2, 1, 7>::Convolution<float, float>;
+template class WinogradGEMM<2, 1, 7, 1>::Convolution<float, float>;
+
+
+
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 11bdbda..39dad8f 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -34,7 +34,7 @@
 #include <string>
 
 using namespace arm_compute;
-
+#ifndef DOXYGEN_SKIP_THIS
 std::string arm_compute::build_information()
 {
     static const std::string information =
@@ -42,7 +42,7 @@
         ;
     return information;
 }
-
+#endif /* DOXYGEN_SKIP_THIS */
 std::string arm_compute::read_file(const std::string &filename, bool binary)
 {
     std::string   out;
@@ -252,6 +252,68 @@
     return pool_type_map[type];
 }
 
+const std::string &arm_compute::string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
+{
+    static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
+    {
+        { GEMMLowpOutputStageType::NONE, "" },
+        { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
+        { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
+        { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
+    };
+
+    return output_stage_map[output_stage];
+}
+
+std::string arm_compute::string_from_pixel_value(const PixelValue &value, const DataType data_type)
+{
+    std::stringstream ss;
+    std::string       converted_string;
+
+    switch(data_type)
+    {
+        case DataType::U8:
+        case DataType::QASYMM8:
+            // Needs conversion to 32 bit, otherwise interpreted as ASCII values
+            ss << uint32_t(value.get<uint8_t>());
+            converted_string = ss.str();
+            break;
+        case DataType::S8:
+            // Needs conversion to 32 bit, otherwise interpreted as ASCII values
+            ss << int32_t(value.get<int8_t>());
+            converted_string = ss.str();
+            break;
+        case DataType::U16:
+            ss << value.get<uint16_t>();
+            converted_string = ss.str();
+            break;
+        case DataType::S16:
+            ss << value.get<int16_t>();
+            converted_string = ss.str();
+            break;
+        case DataType::U32:
+            ss << value.get<uint32_t>();
+            converted_string = ss.str();
+            break;
+        case DataType::S32:
+            ss << value.get<int32_t>();
+            converted_string = ss.str();
+            break;
+        case DataType::F32:
+            converted_string = float_to_string_with_full_precision(value.get<float>());
+            break;
+        case DataType::F16:
+            static_assert(sizeof(half) == 2, "Half must be 16 bit");
+            ss << value.get<half>();
+            converted_string = ss.str();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not handled");
+    }
+
+    return converted_string;
+}
+
 std::string arm_compute::lower_string(const std::string &val)
 {
     std::string res = val;
@@ -274,28 +336,16 @@
     return PadStrideInfo(strides.first, strides.second, same_pad_left, same_pad_right, same_pad_top, same_pad_bottom, DimensionRoundingType::CEIL);
 }
 
-TensorShape arm_compute::deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights)
-{
-    TensorShape out_shape(input);
-    out_shape.set(0, out_dims.first);
-    out_shape.set(1, out_dims.second);
-    out_shape.set(2, weights[3]);
-    return out_shape;
-}
-
 const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
     unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
-    unsigned int inner_border_right, unsigned int inner_border_top, unsigned int stride_x, unsigned int stride_y)
+    unsigned int stride_x, unsigned int stride_y)
 {
     ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
-    ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width + inner_border_right) < 2 * padx);
-    ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height + inner_border_top) < 2 * pady);
-    const int padx_deconv = (kernel_width - padx - 1);
-    const int pady_deconv = (kernel_height - pady - 1);
-    ARM_COMPUTE_ERROR_ON(padx_deconv < 0);
-    ARM_COMPUTE_ERROR_ON(pady_deconv < 0);
-    const int w = stride_x * (in_width - 1) + kernel_width + inner_border_right - 2 * padx_deconv;
-    const int h = stride_y * (in_height - 1) + kernel_height + inner_border_top - 2 * pady_deconv;
+    ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width) < 2 * padx);
+    ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height) < 2 * pady);
+    const int w = stride_x * (in_width - 1) + kernel_width - 2 * padx;
+    const int h = stride_y * (in_height - 1) + kernel_height - 2 * pady;
+
     return std::make_pair<unsigned int, unsigned int>(w, h);
 }
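
The simplification above drops the inner-border parameters and reduces the deconvolution output size to `stride * (in - 1) + kernel - 2 * pad` per dimension. A small worked check, written as a sketch (the helper name is mine, not the library's):

```cpp
#include <utility>

std::pair<unsigned int, unsigned int> deconv_output_dims(unsigned int in_w, unsigned int in_h,
                                                          unsigned int kernel_w, unsigned int kernel_h,
                                                          unsigned int pad_x, unsigned int pad_y,
                                                          unsigned int stride_x, unsigned int stride_y)
{
    const unsigned int w = stride_x * (in_w - 1) + kernel_w - 2 * pad_x;
    const unsigned int h = stride_y * (in_h - 1) + kernel_h - 2 * pad_y;
    return std::make_pair(w, h);
}

// e.g. a 7x7 input upsampled with a 3x3 kernel, stride 2 and pad 1:
//   2 * (7 - 1) + 3 - 2 * 1 = 13  ->  13x13 output
```
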
 
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
new file mode 100644
index 0000000..a4bce5d
--- /dev/null
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace tensor_transform
+{
+Coordinates slice_absolute_end_coords(TensorShape input_shape, Coordinates ends)
+{
+    // Create end mask
+    int32_t end_mask = 0;
+    for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+    {
+        if(ends[i] < 0)
+        {
+            end_mask |= 1 << i;
+        }
+    }
+    // Get unit strides
+    const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
+
+    return strided_slice_absolute_end_coords(input_shape, Coordinates(), ends, unit_strides, end_mask);
+}
+
+TensorShape compute_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends_abs)
+{
+    // Get unit strides
+    const BiStrides unit_strides = strided_slice_strides(input_shape, BiStrides());
+    return compute_strided_slice_output_shape(input_shape, starts, ends_abs, unit_strides);
+}
+
+Coordinates strided_slice_absolute_start_coords(TensorShape input_shape, Coordinates starts, Coordinates strides, int32_t begin_mask)
+{
+    Coordinates starts_abs;
+    for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
+    {
+        // Get start index
+        int start_i = starts[i];
+
+        // Reset in case of begin mask present
+        if((begin_mask & 1 << i) != 0)
+        {
+            start_i = strides[i] > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
+        }
+
+        // Account for negative start points
+        const int dim_size = input_shape[i];
+        if(start_i < 0)
+        {
+            start_i += dim_size;
+        }
+
+        // Final clamp
+        start_i = utility::clamp(start_i, 0, dim_size - 1);
+        starts_abs.set(i, start_i);
+    }
+
+    // Fill remaining
+    for(unsigned int i = starts_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+    {
+        starts_abs.set(i, 0);
+    }
+
+    return starts_abs;
+}
+
+Coordinates strided_slice_absolute_end_coords(TensorShape input_shape, Coordinates starts_abs, Coordinates ends, Coordinates strides,
+                                              int32_t end_mask, int32_t shrink_axis_mask)
+{
+    Coordinates ends_abs;
+    for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+    {
+        // Get end index
+        int stop_i = ends[i];
+
+        // Shrink dimension
+        if((shrink_axis_mask & (1 << i)) != 0)
+        {
+            stop_i = starts_abs[i] + 1;
+        }
+
+        // Reset in case of end mask present
+        if((end_mask & 1 << i) != 0)
+        {
+            stop_i = (strides[i] > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
+        }
+
+        // Account for negative end points
+        const int dim_size = input_shape[i];
+        if(stop_i < 0)
+        {
+            stop_i += dim_size;
+        }
+
+        // Final clamp
+        stop_i = (strides[i] > 0) ? utility::clamp(stop_i, 0, dim_size) : utility::clamp(stop_i, -1, dim_size - 1);
+        ends_abs.set(i, stop_i);
+    }
+
+    // Fill remaining ends
+    for(unsigned int i = ends_abs.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+    {
+        ends_abs.set(i, input_shape[i]);
+    }
+
+    return ends_abs;
+}
+
+Coordinates strided_slice_strides(TensorShape input_shape, Coordinates strides)
+{
+    for(unsigned int i = strides.num_dimensions(); i < input_shape.num_dimensions(); ++i)
+    {
+        strides.set(i, 1);
+    }
+    return strides;
+}
+
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts_abs, Coordinates ends_abs, Coordinates final_strides)
+{
+    TensorShape output_shape = input_shape;
+    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    {
+        const int stride_i = final_strides[i];
+        const int range    = ends_abs[i] - starts_abs[i];
+        if((range == 0) ||                 // Zero range
+           (range < 0 && stride_i >= 0) || // Negative range with positive stride
+           (range > 0 && stride_i <= 0))   // Positive range with negative stride
+        {
+            output_shape.set(i, 0);
+            return output_shape;
+        }
+        else
+        {
+            int dim = range / stride_i + (range % stride_i != 0 ? 1 : 0);
+            output_shape.set(i, dim);
+        }
+    }
+    return output_shape;
+}
+} // namespace tensor_transform
+} // namespace helpers
+} // namespace arm_compute
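
`compute_strided_slice_output_shape` above sizes each dimension as the ceiling of `(end - start) / stride`, collapsing to zero whenever the range and the stride point in opposite directions. A stand-alone restatement with a couple of worked values (the helper name is mine):

```cpp
// Per-dimension output size of a strided slice, after start/end have been made
// absolute (negatives resolved, clamped) as in the helpers above.
int strided_slice_dim(int start_abs, int end_abs, int stride)
{
    const int range = end_abs - start_abs;
    if (range == 0 || (range < 0 && stride >= 0) || (range > 0 && stride <= 0))
    {
        return 0;  // empty or ill-formed slice
    }
    return range / stride + (range % stride != 0 ? 1 : 0);  // ceil(range / stride)
}

// strided_slice_dim(1, 7, 2)  == 3  -> indices {1, 3, 5}
// strided_slice_dim(7, 1, -2) == 3  -> indices {7, 5, 3}
// strided_slice_dim(0, 5, 1)  == 5  -> plain slice with unit strides, as used by
//                                      slice_absolute_end_coords above
```
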
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 8bb6d8e..ea9ba77 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,21 +30,30 @@
 using namespace arm_compute::quantization;
 
 constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+constexpr float   epsilon            = 0.00001f;
 
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(float multiplier,
                                                                                             int   *quant_multiplier,
                                                                                             int   *right_shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier >= 1);
-    if(multiplier == 0)
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < -epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f + epsilon);
+    if(std::fabs(1.0f - multiplier) < epsilon)
+    {
+        *quant_multiplier = 1;
+        *right_shift      = 0;
+        return arm_compute::Status{};
+    }
+
+    if(std::fabs(0.0f - multiplier) < epsilon)
     {
         *quant_multiplier = 0;
         *right_shift      = 0;
         return arm_compute::Status{};
     }
+
     const double q = std::frexp(multiplier, right_shift);
     *right_shift *= -1;
     auto q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
@@ -61,7 +70,7 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(float multiplier,
                                                                                                int   *quantized_multiplier,
                                                                                                int   *left_shift)
 {
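
The multiplier decomposition touched above expresses a real multiplier as a Q0.31 fixed-point value plus a right shift, i.e. `multiplier ~= quant_multiplier * 2^-31 * 2^-right_shift`. A condensed sketch of the less-than-one case (the function name is mine; the frexp/rounding steps mirror the code above):

```cpp
#include <cmath>
#include <cstdint>

void quantize_multiplier_lt_one(float multiplier, int32_t *quant_multiplier, int *right_shift)
{
    const double q = std::frexp(multiplier, right_shift);      // multiplier = q * 2^exp, q in [0.5, 1)
    *right_shift *= -1;                                         // negative exponent becomes a right shift
    auto q_fixed  = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if (q_fixed == (1ll << 31))                                  // rounding pushed q up to 1.0: renormalise
    {
        q_fixed /= 2;
        --*right_shift;
    }
    *quant_multiplier = static_cast<int32_t>(q_fixed);
}

// Example: 0.25f -> quant_multiplier = 1 << 30, right_shift = 1,
//          and (1 << 30) * 2^-31 * 2^-1 == 0.25
```
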
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 81a18c4..b2ca28d 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -132,7 +132,7 @@
     TensorDescriptor common_desc = input_tensor_desc;
     common_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
 
-    // Create mean and nodes
+    // Create mean and var nodes
     auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
     auto var_nid  = add_const_node_with_name(g, params, "Variance", common_desc, std::move(var_accessor));
 
@@ -168,6 +168,20 @@
     return batch_norm_nid;
 }
 
+NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+    CHECK_NODEIDX_PAIR(deltas, g);
+
+    NodeID nid = g.add_node<BoundingBoxTransformLayerNode>(info);
+
+    g.add_connection(input.node_id, input.index, nid, 0);
+    g.add_connection(deltas.node_id, deltas.index, nid, 1);
+
+    set_node_params(g, nid, params);
+    return nid;
+}
+
 NodeID GraphBuilder::add_channel_shuffle_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_groups)
 {
     return create_simple_single_input_output_node<ChannelShuffleLayerNode>(g, params, input, num_groups);
@@ -327,7 +341,13 @@
     {
         TensorDescriptor b_desc = input_tensor_desc;
         b_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
-        b_nid                   = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
+
+        if(is_data_type_quantized_asymmetric(b_desc.data_type))
+        {
+            b_desc.data_type = DataType::S32;
+        }
+
+        b_nid = add_const_node_with_name(g, params, "Bias", b_desc, std::move(bias_accessor));
     }
 
     // Create convolution node and connect
@@ -412,11 +432,58 @@
     return fc_nid;
 }
 
+NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
+{
+    CHECK_NODEIDX_PAIR(scores, g);
+    CHECK_NODEIDX_PAIR(deltas, g);
+    CHECK_NODEIDX_PAIR(anchors, g);
+
+    NodeID nid = g.add_node<GenerateProposalsLayerNode>(info);
+
+    g.add_connection(scores.node_id, scores.index, nid, 0);
+    g.add_connection(deltas.node_id, deltas.index, nid, 1);
+    g.add_connection(anchors.node_id, anchors.index, nid, 2);
+
+    set_node_params(g, nid, params);
+    return nid;
+}
+
 NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
 {
     return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
 }
 
+NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
+                                                   ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+
+    // Get input tensor descriptor
+    const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
+
+    // Calculate Common Descriptor
+    TensorDescriptor common_desc = input_tensor_desc;
+    common_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL));
+
+    // Create mean and std nodes
+    auto mean_nid = add_const_node_with_name(g, params, "Mean", common_desc, std::move(mean_accessor));
+    auto std_nid  = add_const_node_with_name(g, params, "Std", common_desc, std::move(std_accessor));
+
+    // Create normalize planar YUV node and add connections
+    NodeID norm_planar_yuv_nid = g.add_node<NormalizePlanarYUVLayerNode>();
+    g.add_connection(input.node_id, input.index, norm_planar_yuv_nid, 0);
+    g.add_connection(mean_nid, 0, norm_planar_yuv_nid, 1);
+    g.add_connection(std_nid, 0, norm_planar_yuv_nid, 2);
+    set_node_params(g, norm_planar_yuv_nid, params);
+
+    return norm_planar_yuv_nid;
+}
+
+NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, PaddingList padding)
+{
+    return create_simple_single_input_output_node<PadLayerNode>(g, params, input, padding);
+}
+
 NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
 {
     return create_simple_single_input_output_node<PermuteLayerNode>(g, params, input, perm, layout);
@@ -427,6 +494,26 @@
     return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
 }
 
+NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, PriorBoxLayerInfo prior_info)
+{
+    CHECK_NODEIDX_PAIR(input0, g);
+    CHECK_NODEIDX_PAIR(input1, g);
+
+    // Create priorbox node and connect
+    NodeID prior_nid = g.add_node<PriorBoxLayerNode>(prior_info);
+    g.add_connection(input0.node_id, input0.index, prior_nid, 0);
+    g.add_connection(input1.node_id, input1.index, prior_nid, 1);
+
+    set_node_params(g, prior_nid, params);
+
+    return prior_nid;
+}
+
+NodeID GraphBuilder::add_reorg_node(Graph &g, NodeParams params, NodeIdxPair input, int stride)
+{
+    return create_simple_single_input_output_node<ReorgLayerNode>(g, params, input, stride);
+}
+
 NodeID GraphBuilder::add_reshape_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
 {
     return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
@@ -438,6 +525,20 @@
     return create_simple_single_input_output_node<ResizeLayerNode>(g, params, input, policy, width_scale, height_scale);
 }
 
+NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
+{
+    CHECK_NODEIDX_PAIR(input, g);
+    CHECK_NODEIDX_PAIR(rois, g);
+
+    NodeID nid = g.add_node<ROIAlignLayerNode>(pool_info);
+
+    g.add_connection(input.node_id, input.index, nid, 0);
+    g.add_connection(rois.node_id, rois.index, nid, 1);
+
+    set_node_params(g, nid, params);
+    return nid;
+}
+
 NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
 {
     CHECK_NODEIDX_PAIR(input, g);
@@ -472,9 +573,24 @@
     return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
 }
 
+NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
+{
+    return create_simple_single_input_output_node<SliceLayerNode>(g, params, input, starts, ends);
+}
+
 NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
 {
     return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
 }
+
+NodeID GraphBuilder::add_upsample_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D info, InterpolationPolicy upsampling_policy)
+{
+    return create_simple_single_input_output_node<UpsampleLayerNode>(g, params, input, info, upsampling_policy);
+}
+
+NodeID GraphBuilder::add_yolo_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, int32_t num_classes)
+{
+    return create_simple_single_input_output_node<YOLOLayerNode>(g, params, input, act_info, num_classes);
+}
 } // namespace graph
 } // namespace arm_compute
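
All of the new multi-input builders above (`add_bounding_box_transform_node`, `add_generate_proposals_node`, `add_priorbox_node`, `add_roi_align_node`) follow the same wiring recipe: create the node, connect each producer to a distinct input index, then set the node parameters. A generic restatement of that recipe as a hypothetical helper (not part of the library):

```cpp
#include <utility>

template <typename NodeT, typename... Args>
NodeID add_two_input_node(Graph &g, NodeParams params, NodeIdxPair in0, NodeIdxPair in1, Args &&... args)
{
    CHECK_NODEIDX_PAIR(in0, g);
    CHECK_NODEIDX_PAIR(in1, g);

    NodeID nid = g.add_node<NodeT>(std::forward<Args>(args)...);  // node owns its layer info
    g.add_connection(in0.node_id, in0.index, nid, 0);             // first producer  -> input 0
    g.add_connection(in1.node_id, in1.index, nid, 1);             // second producer -> input 1
    set_node_params(g, nid, params);

    return nid;
}

// e.g. add_roi_align_node above is this pattern with NodeT = ROIAlignLayerNode
// and args = the ROIPoolingLayerInfo.
```
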
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index 5f33ed3..037b40b 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/graph.h"
 #include "arm_compute/graph/Utils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
 
 namespace arm_compute
 {
@@ -75,17 +76,20 @@
 
 void GraphContext::finalize()
 {
+    const size_t num_pools = 1;
     for(auto &mm_obj : _memory_managers)
     {
+        ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator);
+
         // Finalize intra layer memory manager
         if(mm_obj.second.intra_mm != nullptr)
         {
-            mm_obj.second.intra_mm->finalize();
+            mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools);
         }
         // Finalize cross layer memory manager
         if(mm_obj.second.cross_mm != nullptr)
         {
-            mm_obj.second.cross_mm->finalize();
+            mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools);
         }
     }
 }
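
With this change the backend allocator travels with the `MemoryManagerContext` (see the CL and GLES backend diffs below) and `finalize()` builds the pools through `populate()` instead of each memory manager owning an allocator. A sketch of the resulting flow; the map type is my assumption about the member being iterated:

```cpp
#include <cstddef>
#include <map>

void finalize_memory_managers(std::map<arm_compute::graph::Target,
                                       arm_compute::graph::MemoryManagerContext> &memory_managers)
{
    const size_t num_pools = 1;
    for (auto &mm_obj : memory_managers)
    {
        // The allocator is the one the backend stored in the context at setup time.
        if (mm_obj.second.intra_mm != nullptr)
        {
            mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools);
        }
        if (mm_obj.second.cross_mm != nullptr)
        {
            mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools);
        }
    }
}
```
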
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index f9d13ac..57c5f9d 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -55,6 +55,7 @@
     }
 
     // Force target to all graph construct
+    // TODO (geopin01) : Support heterogeneous execution
     Target forced_target = target;
     if(!is_target_supported(target))
     {
@@ -101,7 +102,7 @@
 
     // Register graph
     _workloads.insert(std::make_pair(graph.id(), std::move(workload)));
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id() << std::endl);
 }
 
 void GraphManager::execute_graph(Graph &graph)
@@ -137,4 +138,4 @@
     _workloads.erase(it);
 }
 } // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
index 8ed68bd..92860e2 100644
--- a/src/graph/PassManager.cpp
+++ b/src/graph/PassManager.cpp
@@ -44,9 +44,9 @@
     return (index >= _passes.size()) ? nullptr : _passes.at(index).get();
 }
 
-void PassManager::append(std::unique_ptr<IGraphMutator> pass)
+void PassManager::append(std::unique_ptr<IGraphMutator> pass, bool conditional)
 {
-    if(pass)
+    if(pass && conditional)
     {
         ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
         _passes.push_back(std::move(pass));
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index 0a85a7f..71ec548 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -78,20 +78,16 @@
 {
     PassManager pm;
 
+    const bool is_target_gc = target == Target::GC;
+
     // Passes that mutate graph IR
+    pm.append(support::cpp14::make_unique<NodeFusionMutator>(), !is_target_gc);
     pm.append(support::cpp14::make_unique<GroupedConvolutionMutator>());
-    if(target != Target::GC)
-    {
-        pm.append(support::cpp14::make_unique<NodeFusionMutator>());
-        pm.append(support::cpp14::make_unique<InPlaceOperationMutator>());
-    }
+    pm.append(support::cpp14::make_unique<InPlaceOperationMutator>(), !is_target_gc);
 
     // Passes that mutate backend information
-    if(target != Target::GC)
-    {
-        pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>());
-        pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>());
-    }
+    pm.append(support::cpp14::make_unique<DepthConcatSubTensorMutator>(), !is_target_gc);
+    pm.append(support::cpp14::make_unique<SplitLayerSubTensorMutator>(), !is_target_gc);
     pm.append(support::cpp14::make_unique<NodeExecutionMethodMutator>());
 
     return pm;
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 1dbeae9..ae7f0a5 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -69,6 +69,7 @@
 
 CLDeviceBackend::~CLDeviceBackend()
 {
+    // TODO (geopin01) : Shouldn't call non exception safe stuff here
     if(_tuner.tune_new_kernels() && !_tuner.lws_table().empty() && !_tuner_file.empty())
     {
         _tuner.save_to_file(_tuner_file);
@@ -126,6 +127,7 @@
         mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
         mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
         mm_ctx.cross_group = std::make_shared<CLMemoryGroup>(mm_ctx.cross_mm);
+        mm_ctx.allocator   = _allocator.get();
 
         ctx.insert_memory_management_ctx(std::move(mm_ctx));
     }
@@ -194,8 +196,6 @@
     auto pool_mgr     = std::make_shared<PoolManager>();
     auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
 
-    mm->set_allocator(_allocator.get());
-
     return mm;
 }
 } // namespace backends
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index bf3dcba..c37a137 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -83,6 +83,8 @@
             return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
         case NodeType::BatchNormalizationLayer:
             return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+        case NodeType::BoundingBoxTransformLayer:
+            return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
         case NodeType::ChannelShuffleLayer:
             return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
@@ -99,22 +101,40 @@
             return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
             return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+        case NodeType::GenerateProposalsLayer:
+            return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
             return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+        case NodeType::NormalizePlanarYUVLayer:
+            return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+        case NodeType::PadLayer:
+            return detail::create_pad_layer<CLPadLayer, CLTargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
             return detail::create_permute_layer<CLPermute, CLTargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
         case NodeType::PoolingLayer:
             return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+        case NodeType::PriorBoxLayer:
+            return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+        case NodeType::ReorgLayer:
+            return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
             return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ResizeLayer:
             return detail::create_resize_layer<CLScale, CLTargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
+        case NodeType::ROIAlignLayer:
+            return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+        case NodeType::SliceLayer:
+            return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::SoftmaxLayer:
             return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        case NodeType::UpsampleLayer:
+            return detail::create_upsample_layer<CLUpsampleLayer, CLTargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
+        case NodeType::YOLOLayer:
+            return detail::create_yolo_layer<CLYOLOLayer, CLTargetInfo>(*polymorphic_downcast<YOLOLayerNode *>(node), ctx);
         default:
             return nullptr;
     }
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index ba5b59d..a070973 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -47,6 +47,8 @@
     NodeType type = node->type();
     switch(type)
     {
+        case NodeType::BoundingBoxTransformLayer:
+            return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
         case NodeType::ChannelShuffleLayer:
             return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
@@ -57,12 +59,30 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
                    CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::GenerateProposalsLayer:
+            return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
+        case NodeType::NormalizePlanarYUVLayer:
+            return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+        case NodeType::PadLayer:
+            return detail::validate_pad_layer<CLPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
             return detail::validate_permute_layer<CLPermute>(*polymorphic_downcast<PermuteLayerNode *>(node));
+        case NodeType::PriorBoxLayer:
+            return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+        case NodeType::ReorgLayer:
+            return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+        case NodeType::ROIAlignLayer:
+            return detail::validate_roi_align_layer<CLROIAlignLayer>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+        case NodeType::SliceLayer:
+            return detail::validate_slice_layer<CLSlice>(*polymorphic_downcast<SliceLayerNode *>(node));
+        case NodeType::UpsampleLayer:
+            return detail::validate_upsample_layer<CLUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
+        case NodeType::YOLOLayer:
+            return detail::validate_yolo_layer<CLYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
         default:
             return Status{};
     }
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
index fdb044c..219d9d0 100644
--- a/src/graph/backends/CL/CLTensorHandle.cpp
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -69,6 +69,7 @@
 
 void CLTensorHandle::release_if_unused()
 {
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
     if(!_tensor.is_used())
     {
         _tensor.allocator()->free();
@@ -101,4 +102,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
index ec3cf4f..5f0bf3f 100644
--- a/src/graph/backends/GLES/GCDeviceBackend.cpp
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -86,6 +86,7 @@
         mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
         mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Buffer);
         mm_ctx.cross_group = std::make_shared<GCMemoryGroup>(mm_ctx.cross_mm);
+        mm_ctx.allocator   = &_allocator;
 
         ctx.insert_memory_management_ctx(std::move(mm_ctx));
     }
@@ -151,8 +152,6 @@
     auto pool_mgr     = std::make_shared<PoolManager>();
     auto mm           = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
 
-    mm->set_allocator(&_allocator);
-
     return mm;
 }
 } // namespace backends
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index f72513c..2ca453e 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -94,7 +94,8 @@
     func->configure(inputs, output);
 
     // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
                                << " Target " << GCTargetInfo::TargetType
                                << " Data Type: " << output->info()->data_type()
                                << " Shape: " << output->info()->tensor_shape()
@@ -120,8 +121,9 @@
         biases->info()->set_data_type(DataType::S32);
     }
 
-    const PadStrideInfo     conv_info      = node.convolution_info();
-    const ConvolutionMethod conv_algorithm = node.convolution_method();
+    const PadStrideInfo       conv_info      = node.convolution_info();
+    const ConvolutionMethod   conv_algorithm = node.convolution_method();
+    const ActivationLayerInfo fused_act      = node.fused_activation();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, GCTargetInfo::TargetType);
@@ -132,23 +134,26 @@
     {
         std::tie(func, func_name) = create_named_function<GCConvolutionLayerFunctions::DirectConvolutionLayer>(
                                         std::string("DirectConvolutionLayer"),
-                                        input, weights, biases, output, conv_info);
+                                        input, weights, biases, output, conv_info, fused_act);
     }
     else
     {
         std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayerFunctions::GenericConvolutionLayer>(
                                         std::string("ConvolutionLayer"), mm,
-                                        input, weights, biases, output, conv_info);
+                                        input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act);
     }
 
     // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << func_name
                                << " Data Type: " << input->info()->data_type()
                                << " Input QuantInfo: " << input->info()->quantization_info()
                                << " Weights QuantInfo: " << weights->info()->quantization_info()
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
 }
@@ -169,8 +174,10 @@
         biases->info()->set_data_type(DataType::S32);
     }
 
-    const PadStrideInfo              conv_info     = node.convolution_info();
-    const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
+    const PadStrideInfo              conv_info        = node.convolution_info();
+    const DepthwiseConvolutionMethod dwc_algorithm    = node.depthwise_convolution_method();
+    const unsigned int               depth_multiplier = 1;
+    const ActivationLayerInfo        fused_act        = node.fused_activation();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::unique_ptr<IFunction> func;
@@ -179,7 +186,7 @@
     {
         std::tie(func, func_name) = create_named_function<GCDepthwiseConvolutionLayerFunctions::DepthwiseConvolutionLayer3x3>(
                                         std::string("DepthwiseConvolutionLayer3x3"),
-                                        input, weights, biases, output, conv_info);
+                                        input, weights, biases, output, conv_info, depth_multiplier, fused_act);
     }
     else
     {
@@ -187,7 +194,9 @@
     }
 
     // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << func_name
                                << " Target " << GCTargetInfo::TargetType
                                << " Data Type: " << input->info()->data_type()
                                << " Input QuantInfo: " << input->info()->quantization_info()
@@ -195,6 +204,7 @@
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
 }
@@ -241,11 +251,13 @@
     }
 
     // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
-                               << " Target " << GCTargetInfo::TargetType
-                               << " Operation " << func_name
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << GCTargetInfo::TargetType
+                               << " Operation: " << func_name
                                << " Data Type: " << input1->info()->data_type()
-                               << " Shape : " << input1->info()->tensor_shape()
+                               << " Shape: " << input1->info()->tensor_shape()
                                << std::endl);
 
     return func;
@@ -278,6 +290,8 @@
             return detail::create_fully_connected_layer<GCFullyConnectedLayer, GCTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
             return detail::create_normalization_layer<GCNormalizationLayer, GCTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+        case NodeType::NormalizePlanarYUVLayer:
+            return detail::create_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer, GCTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
         case NodeType::PoolingLayer:
             return detail::create_pooling_layer<GCPoolingLayer, GCTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
         case NodeType::ResizeLayer:
@@ -290,4 +304,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
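
Note: the convolution and depthwise-convolution factory hunks above forward the node's fused activation into the backend function's configure call, so the activation runs inside the convolution instead of as a separate ActivationLayer node; the log lines also gain the node name and, when set, the fused activation. A tiny illustrative sketch (plain C++, not the GC functions) of why the fused and unfused paths produce the same values:

    // Illustrative only: fusion applies the activation to the convolution
    // result inside one function, avoiding a separate node and intermediate
    // tensor, but the computed value is unchanged.
    #include <algorithm>
    #include <cstdio>

    static float relu(float x) { return std::max(0.0f, x); }

    int main()
    {
        const float conv_result = -0.75f;            // some convolution output value
        const float separate    = relu(conv_result); // conv node followed by activation node
        const float fused       = relu(conv_result); // single node with fused_act set
        std::printf("%f %f\n", separate, fused);     // identical results
        return 0;
    }
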
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index 53049c7..fe69c7a 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -55,6 +55,7 @@
     arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
     ARM_COMPUTE_ERROR_ON(weights == nullptr);
 
+    // TODO (geopin01) : Switch when validation is implemented
     // Validate function
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 && weights->tensor_shape().y() != 3, "Unsupported depthwise convolution");
     node.set_depthwise_convolution_method(DepthwiseConvolutionMethod::Optimized3x3);
@@ -102,6 +103,8 @@
     NodeType type = node->type();
     switch(type)
     {
+        case NodeType::BoundingBoxTransformLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
         case NodeType::ChannelShuffleLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ChannelShuffleLayer");
         case NodeType::ConvolutionLayer:
@@ -110,10 +113,28 @@
             return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
         case NodeType::FlattenLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
+        case NodeType::GenerateProposalsLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+        case NodeType::NormalizePlanarYUVLayer:
+            return detail::validate_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+        case NodeType::PadLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PadLayer");
         case NodeType::PermuteLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PermuteLayer");
+        case NodeType::PriorBoxLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PriorBoxLayer");
+        case NodeType::ReorgLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReorgLayer");
         case NodeType::ReshapeLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReshapeLayer");
+        case NodeType::ROIAlignLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+        case NodeType::SliceLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer");
+        case NodeType::UpsampleLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : UpsampleLayer");
+        case NodeType::YOLOLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : YOLOLayer");
         default:
             return Status{};
     }
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
index 6f96263..4e5c652 100644
--- a/src/graph/backends/GLES/GCTensorHandle.cpp
+++ b/src/graph/backends/GLES/GCTensorHandle.cpp
@@ -69,6 +69,7 @@
 
 void GCTensorHandle::release_if_unused()
 {
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
     if(!_tensor.is_used())
     {
         _tensor.allocator()->free();
@@ -101,4 +102,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 5fc44d0..23ced2f 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -86,6 +86,7 @@
         mm_ctx.intra_mm    = create_memory_manager(MemoryManagerAffinity::Offset);
         mm_ctx.cross_mm    = create_memory_manager(MemoryManagerAffinity::Offset);
         mm_ctx.cross_group = std::make_shared<MemoryGroup>(mm_ctx.cross_mm);
+        mm_ctx.allocator   = &_allocator;
 
         ctx.insert_memory_management_ctx(std::move(mm_ctx));
     }
@@ -156,8 +157,6 @@
     auto pool_mgr = std::make_shared<PoolManager>();
     auto mm       = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
 
-    mm->set_allocator(&_allocator);
-
     return mm;
 }
 } // namespace backends
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 36a25ad..ca8d485 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -90,13 +90,16 @@
     NETargetInfo::TensorType *biases  = get_backing_tensor<NETargetInfo>(node.input(2));
     NETargetInfo::TensorType *output  = get_backing_tensor<NETargetInfo>(node.output(0));
 
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()))
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+
+    if(is_quantized)
     {
         biases->info()->set_data_type(DataType::S32);
     }
 
-    const PadStrideInfo     conv_info      = node.convolution_info();
-    const ConvolutionMethod conv_algorithm = node.convolution_method();
+    const PadStrideInfo       conv_info      = node.convolution_info();
+    const ConvolutionMethod   conv_algorithm = node.convolution_method();
+    const ActivationLayerInfo fused_act      = node.fused_activation();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::NEON);
@@ -105,33 +108,40 @@
     if(conv_algorithm == ConvolutionMethod::Direct)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEDirectConvolutionLayer>(
-                                        std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
     }
     else if(conv_algorithm == ConvolutionMethod::GEMM)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEGEMMConvolutionLayer>(
-                                        std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
     }
     else if(conv_algorithm == ConvolutionMethod::Winograd)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEWinogradConvolutionLayer>(
-                                        std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
     }
     else
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEConvolutionLayer>(
-                                        std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
     }
 
     // Log info
+    std::ostringstream qss;
+    if(is_quantized)
+    {
+        qss << " Input QuantInfo: " << input->info()->quantization_info()
+            << " Weights QuantInfo: " << weights->info()->quantization_info()
+            << " Output QuantInfo: " << output->info()->quantization_info();
+    }
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << func_name
                                << " Target " << NETargetInfo::TargetType
                                << " Data Type: " << input->info()->data_type()
-                               << " Input QuantInfo: " << input->info()->quantization_info()
-                               << " Weights QuantInfo: " << weights->info()->quantization_info()
+                               << qss.str()
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
 }
@@ -153,8 +163,10 @@
     func->configure(input, output, norm_info);
 
     // Log info
-    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
-                               << " Target " << NETargetInfo::TargetType
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << NETargetInfo::TargetType
                                << " Data Type: " << input->info()->data_type()
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
@@ -179,6 +191,8 @@
             return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
         case NodeType::BatchNormalizationLayer:
             return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+        case NodeType::ChannelShuffleLayer:
+            return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
             return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
         case NodeType::DeconvolutionLayer:
@@ -199,16 +213,24 @@
             return detail::create_permute_layer<NEPermute, NETargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
         case NodeType::PoolingLayer:
             return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+        case NodeType::PriorBoxLayer:
+            return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+        case NodeType::ReorgLayer:
+            return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
             return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ResizeLayer:
             return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
         case NodeType::SoftmaxLayer:
             return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+        case NodeType::UpsampleLayer:
+            return detail::create_upsample_layer<NEUpsampleLayer, NETargetInfo>(*polymorphic_downcast<UpsampleLayerNode *>(node), ctx);
+        case NodeType::YOLOLayer:
+            return detail::create_yolo_layer<NEYOLOLayer, NETargetInfo>(*polymorphic_downcast<YOLOLayerNode *>(node), ctx);
         default:
             return nullptr;
     }
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 58ffaf0..a2abc83 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -47,8 +47,10 @@
     NodeType type = node->type();
     switch(type)
     {
+        case NodeType::BoundingBoxTransformLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
         case NodeType::ChannelShuffleLayer:
-            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ChannelShuffleLayer");
+            return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
             return detail::validate_convolution_layer<NEConvolutionLayer,
                    NEDirectConvolutionLayer,
@@ -57,12 +59,30 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
                    NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::GenerateProposalsLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+        case NodeType::NormalizePlanarYUVLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer");
+        case NodeType::PadLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PadLayer");
         case NodeType::PermuteLayer:
             return detail::validate_permute_layer<NEPermute>(*polymorphic_downcast<PermuteLayerNode *>(node));
+        case NodeType::PriorBoxLayer:
+            return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+        case NodeType::ReorgLayer:
+            return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
+        case NodeType::ROIAlignLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+        case NodeType::SliceLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer");
+        case NodeType::UpsampleLayer:
+            return detail::validate_upsample_layer<NEUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
+        case NodeType::YOLOLayer:
+            return detail::validate_yolo_layer<NEYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
         default:
             return Status{};
     }
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index caa2c10..5892116 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -68,6 +68,7 @@
 
 void NETensorHandle::release_if_unused()
 {
+    // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
     if(!_tensor.is_used())
     {
         _tensor.allocator()->free();
@@ -100,4 +101,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index 6b2f68c..7fc5ca0 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -87,6 +87,7 @@
         // If its a const node:
         if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
         {
+            // TODO (geopin01) : Create IO iterator wrappers
             // Add all its inputs / outputs to the list of constant handles
             for(unsigned int i = 0; i < node->num_inputs(); ++i)
             {
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index f479963..f2c381b 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -254,7 +254,8 @@
     bool is_valid = true;
     std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor)
     {
-        is_valid = is_valid && (output_tensor != nullptr) && output_tensor->call_accessor();
+        bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
+        is_valid          = is_valid && valid_output;
     });
 
     return is_valid;
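
Note: the hunk above stops relying on && short-circuiting inside the lambda. With the old form, once is_valid became false the remaining call_accessor() invocations were skipped, so later output accessors never ran; computing valid_output first guarantees every accessor is called while the overall validity is still accumulated. A minimal standalone sketch of the difference:

    // Illustrative only: shows why the accumulation order matters.
    #include <cstdio>
    #include <vector>

    static bool call_accessor(int id)
    {
        std::printf("accessor %d called\n", id);
        return id != 0; // pretend accessor 0 fails
    }

    int main()
    {
        std::vector<int> outputs{ 0, 1, 2 };

        bool is_valid = true;
        for(int id : outputs)
        {
            // Old form `is_valid = is_valid && call_accessor(id);` would skip
            // accessors 1 and 2 once is_valid turned false.
            bool valid_output = call_accessor(id); // always invoked
            is_valid          = is_valid && valid_output;
        }
        std::printf("is_valid = %d\n", is_valid);
        return 0;
    }
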
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index 0d65d6a..d69d2cd 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -41,7 +41,7 @@
 namespace
 {
 NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
-                                  PadStrideInfo conv_info, ConvolutionMethod method, FastMathHint fast_math_hint, unsigned int num_groups)
+                                  PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups)
 {
     bool has_bias = (bias != EmptyNodeID);
 
@@ -86,6 +86,10 @@
         ARM_COMPUTE_ERROR_ON(node == nullptr);
         node->set_common_node_parameters(group_params);
 
+        // Down-cast node
+        auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
+        conv_node->set_fused_activation(fused_act);
+
         convolution_outputs.push_back({ conv_nid, 0 });
     }
 
@@ -127,17 +131,20 @@
                 auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
 
                 // Get internal convolution info
-                const PadStrideInfo     conv_info       = conv_node->convolution_info();
-                const ConvolutionMethod conv_method     = conv_node->convolution_method();
-                const FastMathHint      fast_math_hint  = conv_node->fast_math_hint();
-                const unsigned int      num_groups      = conv_node->num_groups();
-                const NodeParams        params          = conv_node->common_node_params();
-                const Target            assigned_target = conv_node->assigned_target();
+                // TODO (geopin01) : Create a descriptor or a clone interface
+                const PadStrideInfo       conv_info       = conv_node->convolution_info();
+                const ConvolutionMethod   conv_method     = conv_node->convolution_method();
+                const ActivationLayerInfo fused_act_info  = conv_node->fused_activation();
+                const FastMathHint        fast_math_hint  = conv_node->fast_math_hint();
+                const unsigned int        num_groups      = conv_node->num_groups();
+                const NodeParams          params          = conv_node->common_node_params();
+                const Target              assigned_target = conv_node->assigned_target();
 
                 // Extract node ids
-                const NodeID input_id   = conv_node->input_id(0);
-                const NodeID weights_id = conv_node->input_id(1);
-                const NodeID bias_id    = conv_node->input_id(2);
+                ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr);
+                const NodeID input_id   = conv_node->input_edge(0)->producer()->id();
+                const NodeID weights_id = conv_node->input_edge(1)->producer()->id();
+                const NodeID bias_id    = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
 
                 // Get driving nodes
                 std::vector<NodeIdxPair> driving_nodes = get_driving_nodes(*node);
@@ -151,7 +158,7 @@
 
                 // Create grouped convolution node
                 NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id,
-                                                                    conv_info, conv_method, fast_math_hint, num_groups);
+                                                                    conv_info, conv_method, fused_act_info, fast_math_hint, num_groups);
 
                 // Remove convolution node
                 g.remove_node(node->id());
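
Note: two things change in the mutator above: the original node's fused activation is now copied onto every per-group ConvolutionLayerNode, and the input, weights and bias node IDs are taken from the producing edges, with the bias edge allowed to be absent, rather than from input_id(). A small standalone sketch of the null-safe bias handling (illustrative types only):

    // Illustrative sketch: a grouped convolution may have no bias, so the third
    // input edge can legitimately be missing and the bias node id falls back to
    // an "empty" sentinel value.
    #include <cstdio>

    struct Edge { int producer_id; };

    int main()
    {
        const int   EmptyNodeID = -1;
        const Edge *bias_edge   = nullptr; // no bias connected in this example

        const int bias_id = (bias_edge != nullptr) ? bias_edge->producer_id : EmptyNodeID;
        std::printf("bias_id = %d\n", bias_id); // prints -1
        return 0;
    }
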
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 82bfe25..9dc02d1 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -38,44 +38,49 @@
 {
 namespace detail
 {
-void fuse_batch_norm_with_activation(Graph &g)
+template <typename N>
+void fuse_node_with_activation(Graph                              &g,
+                               const std::set<Activation>         &supported_fused_activations,
+                               std::function<bool(INode &)> const &prec)
 {
-    // Supported activations when fusing
-    const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
-
     // Not interested in the order of nodes
     for(auto &node : g.nodes())
     {
-        // Check if the node is batch norm and not a branching node
-        if(node && node->type() == NodeType::BatchNormalizationLayer && node->output_edges().size() == 1)
+        // Check if the node is of type N and not a branching node
+        if(node && node->type() == N::node_type && node->output_edges().size() == 1)
         {
             auto output_edge_id = *node->output_edges().begin();
             auto output_edge    = g.edge(output_edge_id);
             // Check if following node is an activation layer node
             if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
             {
-                auto *bn_node  = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->producer());
+                auto *n_node   = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
                 auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
 
-                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || bn_node->output(0) == nullptr);
+                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
 
+                // Check given precondition
+                if(!prec(*n_node))
+                {
+                    continue;
+                }
                 // Check if activation is supported for fusion
                 if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
                 {
                     continue;
                 }
 
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing Batch Normalization node with ID : " << output_edge->producer_id()
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
                                               << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
 
-                // Prevent fusion if batch normalization node has an output accessor
-                if(bn_node->output(0)->accessor() == nullptr)
+                // Prevent fusion if fused node has an output accessor
+                if(n_node->output(0)->accessor() == nullptr)
                 {
                     // Get driving nodes of activation node
                     std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
 
-                    // Set activation info to batch normalization
-                    bn_node->set_fused_activation(act_node->activation_info());
+                    // Set activation info to fused node
+                    n_node->set_fused_activation(act_node->activation_info());
 
                     // Extract activation node accessor if any
                     auto act_node_accessor = act_node->output(0)->extract_accessor();
@@ -83,18 +88,18 @@
                     // Remove activation node
                     g.remove_node(act_node->id());
 
-                    // Update batch normalization node outputs
+                    // Update fused node outputs
                     for(auto &driving_node : act_driving_nodes)
                     {
-                        g.add_connection(bn_node->id(), 0, driving_node.node_id, driving_node.index);
+                        g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
                     }
 
-                    // Update accessor to batch normalization node
-                    bn_node->output(0)->set_accessor(std::move(act_node_accessor));
+                    // Update accessor to fused node
+                    n_node->output(0)->set_accessor(std::move(act_node_accessor));
                 }
                 else
                 {
-                    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion as batch normalization node has an output accessor\n");
+                    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
                 }
             }
         }
@@ -109,7 +114,24 @@
 
 void NodeFusionMutator::mutate(Graph &g)
 {
-    detail::fuse_batch_norm_with_activation(g);
+    // Supported activations when fusing
+    const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
+
+    // Preconditions
+    auto empty_prec = [](INode & n)
+    {
+        return true;
+    };
+    auto qs8_prec = [](INode & n)
+    {
+        ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
+        return n.output(0)->desc().data_type == DataType::QASYMM8;
+    };
+
+    // Fusion mutations
+    detail::fuse_node_with_activation<BatchNormalizationLayerNode>(g, supported_fused_activations, empty_prec);
+    detail::fuse_node_with_activation<ConvolutionLayerNode>(g, supported_fused_activations, empty_prec);
+    detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>(g, supported_fused_activations, qs8_prec);
 }
 } // namespace graph
 } // namespace arm_compute
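
Note: fuse_node_with_activation<N> generalises the old batch-normalization-only fusion: any node type that exposes a static node_type member and set_fused_activation() can absorb a following ActivationLayer, subject to a caller-supplied precondition. In mutate() the batch-normalization and convolution fusions use the always-true precondition, while depthwise convolution is only fused for QASYMM8 outputs. A standalone sketch of the precondition idea (illustrative types, not the graph API):

    // Illustrative only: a fusion pass takes a predicate and only fuses nodes
    // for which it returns true.
    #include <cstdio>
    #include <functional>
    #include <vector>

    struct Node { bool quantized; };

    static void fuse_if(std::vector<Node> &nodes, const std::function<bool(Node &)> &prec)
    {
        for(Node &n : nodes)
        {
            if(prec(n))
            {
                std::printf("fused a node\n");
            }
        }
    }

    int main()
    {
        std::vector<Node> nodes{ { true }, { false } };

        auto empty_prec   = [](Node &) { return true; };          // fuse everything
        auto qasymm8_prec = [](Node &n) { return n.quantized; };  // fuse quantized nodes only

        fuse_if(nodes, empty_prec);   // fuses both nodes
        fuse_if(nodes, qasymm8_prec); // fuses only the first node
        return 0;
    }
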
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index 3ae11fc..3d392bd 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -78,7 +78,7 @@
 
 NodeType BatchNormalizationLayerNode::type() const
 {
-    return NodeType::BatchNormalizationLayer;
+    return BatchNormalizationLayerNode::node_type;
 }
 
 void BatchNormalizationLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
new file mode 100644
index 0000000..ad261e3
--- /dev/null
+++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info)
+    : _bbox_info(info)
+{
+    _input_edges.resize(2, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+const BoundingBoxTransformInfo &BoundingBoxTransformLayerNode::info() const
+{
+    return _bbox_info;
+}
+
+bool BoundingBoxTransformLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor BoundingBoxTransformLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *deltas = input(1);
+    ARM_COMPUTE_ERROR_ON(deltas == nullptr);
+
+    TensorDescriptor output_desc = deltas->desc();
+    return output_desc;
+}
+
+NodeType BoundingBoxTransformLayerNode::type() const
+{
+    return NodeType::BoundingBoxTransformLayer;
+}
+
+void BoundingBoxTransformLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index e9cb039..15c7ff6 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -37,7 +37,7 @@
                                            ConvolutionMethod method,
                                            FastMathHint      fast_math_hint,
                                            QuantizationInfo  out_quant_info)
-    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info)
+    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info), _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -73,6 +73,16 @@
     return _num_groups;
 }
 
+ActivationLayerInfo ConvolutionLayerNode::fused_activation() const
+{
+    return _fused_activation;
+}
+
+void ConvolutionLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    _fused_activation = fused_activation;
+}
+
 TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                                  const TensorDescriptor &weights_descriptor,
                                                                  const PadStrideInfo    &info)
@@ -126,7 +136,7 @@
 
 NodeType ConvolutionLayerNode::type() const
 {
-    return NodeType::ConvolutionLayer;
+    return ConvolutionLayerNode::node_type;
 }
 
 void ConvolutionLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index 9329ae3..e7ccffd 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -51,8 +51,7 @@
 
 TensorDescriptor DeconvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                                    const TensorDescriptor &weights_descriptor,
-                                                                   const PadStrideInfo    &info,
-                                                                   const Size2D           &inner_border)
+                                                                   const PadStrideInfo    &info)
 {
     unsigned int output_width  = 0;
     unsigned int output_height = 0;
@@ -65,7 +64,6 @@
     std::tie(output_width, output_height) = deconvolution_output_dimensions(input_width, input_height,
                                                                             kernel_width, kernel_height,
                                                                             info.pad().first, info.pad().second,
-                                                                            inner_border.x(), inner_border.y(),
                                                                             info.stride().first, info.stride().second);
 
     TensorDescriptor output_descriptor = input_descriptor;
@@ -96,7 +94,7 @@
 
     ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
 
-    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _inner_border);
+    TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
     return output_info;
 }
 
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 1a6f8d3..02d1632 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -33,7 +33,7 @@
 namespace graph
 {
 DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, DepthwiseConvolutionMethod method)
-    : _info(std::move(info)), _method(method)
+    : _info(std::move(info)), _method(method), _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -54,6 +54,16 @@
     return _info;
 }
 
+ActivationLayerInfo DepthwiseConvolutionLayerNode::fused_activation() const
+{
+    return _fused_activation;
+}
+
+void DepthwiseConvolutionLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    _fused_activation = fused_activation;
+}
+
 TensorDescriptor DepthwiseConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                                           const TensorDescriptor &weights_descriptor,
                                                                           const PadStrideInfo    &info)
@@ -100,7 +110,7 @@
 
 NodeType DepthwiseConvolutionLayerNode::type() const
 {
-    return NodeType::DepthwiseConvolutionLayer;
+    return DepthwiseConvolutionLayerNode::node_type;
 }
 
 void DepthwiseConvolutionLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
index 78b45dc..baae555 100644
--- a/src/graph/nodes/FlattenLayerNode.cpp
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -57,7 +57,7 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
     TensorDescriptor output_desc = src->desc();
-    output_desc.shape.collapse(src->desc().shape.num_dimensions());
+    output_desc.shape.collapse(3);
 
     return output_desc;
 }
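
Note: collapsing only the first three dimensions (width, height, channels) instead of all of them preserves the batch dimension, so flattening a batched input no longer merges the samples together. A worked example using the usual WHCN ordering:

    // Worked example for the collapse(3) change above: a (W, H, C, N) tensor of
    // shape (7, 7, 64, 4) now flattens to (3136, 4); collapsing every dimension
    // would have produced a single dimension of 12544, losing the batch axis.
    #include <cstdio>

    int main()
    {
        const unsigned int w = 7, h = 7, c = 64, n = 4;

        const unsigned int flattened_per_sample = w * h * c;     // 3136
        const unsigned int old_collapse_all     = w * h * c * n; // 12544

        std::printf("new shape: (%u, %u)\n", flattened_per_sample, n);
        std::printf("old shape: (%u)\n", old_collapse_all);
        return 0;
    }
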
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
new file mode 100644
index 0000000..7367e80
--- /dev/null
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info)
+    : _info(info)
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(3, NullTensorID);
+}
+
+const GenerateProposalsInfo &GenerateProposalsLayerNode::info() const
+{
+    return _info;
+}
+
+bool GenerateProposalsLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
+       && (output_id(2) != NullTensorID))
+    {
+        for(unsigned int i = 0; i < 3; ++i)
+        {
+            Tensor *dst = output(i);
+            ARM_COMPUTE_ERROR_ON(dst == nullptr);
+            dst->desc() = configure_output(i);
+        }
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor GenerateProposalsLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_ERROR_ON(idx > 3);
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+    TensorDescriptor output_desc = src->desc();
+
+    switch(idx)
+    {
+        case 0:
+            // Configure proposals output
+            output_desc.shape = TensorShape(5, src->desc().shape.total_size());
+            break;
+        case 1:
+            // Configure scores_out output
+            output_desc.shape = TensorShape(src->desc().shape.total_size());
+            break;
+        case 2:
+            // Configure num_valid_proposals
+            output_desc.shape     = TensorShape(1);
+            output_desc.data_type = DataType::U32;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported output index");
+    }
+    return output_desc;
+}
+
+NodeType GenerateProposalsLayerNode::type() const
+{
+    return NodeType::GenerateProposalsLayer;
+}
+
+void GenerateProposalsLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
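
Note: the new node declares three outputs whose descriptors are all derived from the first input: proposals as a (5, N) tensor, scores as a (N) tensor, and num_valid_proposals as a single U32 element, where N is the total size of the input's shape. A short recap with example numbers (N = 1000 is just an illustration):

    // Illustrative recap of configure_output() above.
    #include <cstdio>

    int main()
    {
        const unsigned int N = 1000; // src->desc().shape.total_size()

        std::printf("proposals:           (5, %u)\n", N); // output 0
        std::printf("scores_out:          (%u)\n", N);    // output 1
        std::printf("num_valid_proposals: (1), U32\n");   // output 2
        return 0;
    }
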
diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
new file mode 100644
index 0000000..129b380
--- /dev/null
+++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+NormalizePlanarYUVLayerNode::NormalizePlanarYUVLayerNode()
+{
+    _input_edges.resize(3, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+bool NormalizePlanarYUVLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor NormalizePlanarYUVLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType NormalizePlanarYUVLayerNode::type() const
+{
+    return NodeType::NormalizePlanarYUVLayer;
+}
+
+void NormalizePlanarYUVLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp
new file mode 100644
index 0000000..e7996d2
--- /dev/null
+++ b/src/graph/nodes/PadLayerNode.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PadLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PadLayerNode::PadLayerNode(PaddingList &padding)
+    : _padding(padding)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+const PaddingList &PadLayerNode::padding() const
+{
+    return _padding;
+}
+
+bool PadLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor PadLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    TensorDescriptor  output_desc = src->desc();
+    const TensorShape input_shape = src->desc().shape;
+    for(size_t dim = 0; dim < _padding.size(); ++dim)
+    {
+        output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second);
+    }
+
+    return output_desc;
+}
+
+NodeType PadLayerNode::type() const
+{
+    return NodeType::PadLayer;
+}
+
+void PadLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
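
Note: for each dimension covered by the padding list the output extent is pad_before + input + pad_after; dimensions beyond the list keep their input size. A worked example:

    // Worked example for PadLayerNode::configure_output() above: padding a
    // (224, 224, 3) input with {(1, 1), (1, 1)} pads only the first two
    // dimensions, giving (226, 226, 3).
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main()
    {
        std::vector<unsigned int>                          shape{ 224, 224, 3 };
        std::vector<std::pair<unsigned int, unsigned int>> padding{ { 1, 1 }, { 1, 1 } };

        for(size_t dim = 0; dim < padding.size(); ++dim)
        {
            shape[dim] = padding[dim].first + shape[dim] + padding[dim].second;
        }

        std::printf("(%u, %u, %u)\n", shape[0], shape[1], shape[2]); // (226, 226, 3)
        return 0;
    }
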
diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp
new file mode 100644
index 0000000..edb1fba
--- /dev/null
+++ b/src/graph/nodes/PriorBoxLayerNode.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/PriorBoxLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info)
+    : _info(std::move(prior_info))
+{
+    _input_edges.resize(2, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+PriorBoxLayerInfo PriorBoxLayerNode::priorbox_info() const
+{
+    return _info;
+}
+
+TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                              const PriorBoxLayerInfo &info)
+{
+    const unsigned int layer_width  = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int layer_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int num_priors   = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(0, layer_width * layer_height * num_priors * 4);
+    output_descriptor.shape.set(1, 2);
+    output_descriptor.shape.set(2, 1);
+
+    return output_descriptor;
+}
+
+bool PriorBoxLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor PriorBoxLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *input0 = input(0);
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+
+    return compute_output_descriptor(input0->desc(), _info);
+}
+
+NodeType PriorBoxLayerNode::type() const
+{
+    return NodeType::PriorBoxLayer;
+}
+
+void PriorBoxLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
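
Note: the output descriptor packs all prior boxes of a feature map into a fixed 3-D layout: num_priors per location is aspect_ratios * min_sizes + max_sizes, the first dimension holds layer_width * layer_height * num_priors * 4 coordinates, the second dimension is 2 (conventionally box coordinates and their variances) and the third is 1. A worked example:

    // Worked example for compute_output_descriptor() above: a 19x19 feature map
    // with 6 aspect ratios, 1 min size and 1 max size.
    #include <cstdio>

    int main()
    {
        const unsigned int layer_width = 19, layer_height = 19;
        const unsigned int aspect_ratios = 6, min_sizes = 1, max_sizes = 1;

        const unsigned int num_priors = aspect_ratios * min_sizes + max_sizes;      // 7
        const unsigned int dim0       = layer_width * layer_height * num_priors * 4; // 10108

        std::printf("output shape: (%u, 2, 1)\n", dim0);
        return 0;
    }
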
diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp
new file mode 100644
index 0000000..5e89ef2
--- /dev/null
+++ b/src/graph/nodes/ROIAlignLayerNode.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info)
+    : _pool_info(pool_info)
+{
+    _input_edges.resize(2, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+const ROIPoolingLayerInfo &ROIAlignLayerNode::pooling_info() const
+{
+    return _pool_info;
+}
+
+bool ROIAlignLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor ROIAlignLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src  = input(0);
+    const Tensor *rois = input(1);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+    ARM_COMPUTE_ERROR_ON(rois == nullptr);
+
+    TensorDescriptor output_desc = src->desc();
+
+    const size_t idx_n = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::BATCHES);
+    const size_t idx_c = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::CHANNEL);
+    const size_t idx_h = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_w = get_data_layout_dimension_index(output_desc.layout, DataLayoutDimension::WIDTH);
+
+    output_desc.shape.set(idx_n, rois->desc().shape[1]);
+    output_desc.shape.set(idx_c, src->desc().shape[idx_c]);
+    output_desc.shape.set(idx_h, _pool_info.pooled_height());
+    output_desc.shape.set(idx_w, _pool_info.pooled_width());
+
+    return output_desc;
+}
+
+NodeType ROIAlignLayerNode::type() const
+{
+    return NodeType::ROIAlignLayer;
+}
+
+void ROIAlignLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
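A hedged sketch of the output shape produced by ROIAlignLayerNode::configure_output() above, using hypothetical tensor sizes; the batch dimension is taken from the second dimension of the ROIs tensor and the spatial dimensions from the pooling info.

    // Hypothetical NCHW example:
    //   src:  W=64, H=64, C=256, N=1
    //   rois: shape (values_per_roi, num_rois) = (5, 100)   // 5 values per ROI is an assumption
    //   pool_info: pooled_width() = 7, pooled_height() = 7
    // Output shape set above: W=7, H=7, C=256, N=rois.shape[1]=100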
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
new file mode 100644
index 0000000..6b83f6b
--- /dev/null
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ReorgLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+ReorgLayerNode::ReorgLayerNode(int stride)
+    : _stride(stride)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+int ReorgLayerNode::stride() const
+{
+    return _stride;
+}
+
+TensorDescriptor ReorgLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int stride)
+{
+    const unsigned int input_width   = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height  = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+    const unsigned int input_channel = get_dimension_size(input_descriptor, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_ERROR_ON(stride <= 0);
+    ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width / stride);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height / stride);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+
+    return output_descriptor;
+}
+
+bool ReorgLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor ReorgLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return compute_output_descriptor(src->desc(), _stride);
+}
+
+NodeType ReorgLayerNode::type() const
+{
+    return NodeType::ReorgLayer;
+}
+
+void ReorgLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
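As a worked example of the reorg shape computation above (sizes are hypothetical): the spatial dimensions shrink by the stride while the channel count grows by stride squared.

    const int stride = 2;
    // Hypothetical input: W=26, H=26, C=64
    // Output shape set above: W = 26 / 2 = 13, H = 26 / 2 = 13, C = 64 * 2 * 2 = 256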
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
new file mode 100644
index 0000000..3a29e4c
--- /dev/null
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/SliceLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+SliceLayerNode::SliceLayerNode(Coordinates &starts, Coordinates &ends)
+    : _starts(starts), _ends(ends)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+Coordinates SliceLayerNode::starts() const
+{
+    return _starts;
+}
+
+Coordinates SliceLayerNode::ends() const
+{
+    return _ends;
+}
+
+TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                           const Coordinates &starts, const Coordinates &ends)
+{
+    // Get absolute end coordinates
+    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input_descriptor.shape, ends);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    for(unsigned int i = 0; i < starts.num_dimensions(); ++i)
+    {
+        output_descriptor.shape.set(i, ends_abs[i] - starts[i]);
+    }
+
+    return output_descriptor;
+}
+
+bool SliceLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor SliceLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return compute_output_descriptor(src->desc(), _starts, _ends);
+}
+
+NodeType SliceLayerNode::type() const
+{
+    return NodeType::SliceLayer;
+}
+
+void SliceLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
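A minimal sketch of the slice shape computation above, assuming the end coordinates are already absolute (negative ends are resolved by slice_absolute_end_coords()); the coordinates are hypothetical.

    // Hypothetical 2D example: input shape (10, 8), starts (4, 2), ends (9, 6)
    // Output shape set above: (9 - 4, 6 - 2) = (5, 4); untouched dimensions keep the input size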
diff --git a/src/graph/nodes/UpsampleLayerNode.cpp b/src/graph/nodes/UpsampleLayerNode.cpp
new file mode 100644
index 0000000..bdd39e8
--- /dev/null
+++ b/src/graph/nodes/UpsampleLayerNode.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/UpsampleLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+UpsampleLayerNode::UpsampleLayerNode(Size2D info, InterpolationPolicy upsampling_policy)
+    : _info(info), _upsampling_policy(upsampling_policy)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+Size2D UpsampleLayerNode::info() const
+{
+    return _info;
+}
+
+InterpolationPolicy UpsampleLayerNode::upsampling_policy() const
+{
+    return _upsampling_policy;
+}
+
+TensorDescriptor UpsampleLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                              Size2D                  info)
+{
+    const unsigned int input_width  = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
+    const unsigned int input_height = get_dimension_size(input_descriptor, DataLayoutDimension::HEIGHT);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::WIDTH), input_width * info.x());
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor, DataLayoutDimension::HEIGHT), input_height * info.y());
+
+    return output_descriptor;
+}
+
+bool UpsampleLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor UpsampleLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return compute_output_descriptor(src->desc(), _info);
+}
+
+NodeType UpsampleLayerNode::type() const
+{
+    return NodeType::UpsampleLayer;
+}
+
+void UpsampleLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
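For reference, the shape effect of UpsampleLayerNode::compute_output_descriptor() above with a hypothetical scale factor.

    // Hypothetical input: W=16, H=16 and info = Size2D(2, 2)
    // Output shape set above: W = 16 * 2 = 32, H = 16 * 2 = 32; all other dimensions are unchanged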
diff --git a/src/graph/nodes/YOLOLayerNode.cpp b/src/graph/nodes/YOLOLayerNode.cpp
new file mode 100644
index 0000000..cf1e576
--- /dev/null
+++ b/src/graph/nodes/YOLOLayerNode.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/YOLOLayerNode.h"
+
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+YOLOLayerNode::YOLOLayerNode(ActivationLayerInfo act_info, int32_t num_classes)
+    : _act_info(act_info), _num_classes(num_classes)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+ActivationLayerInfo YOLOLayerNode::activation_info() const
+{
+    return _act_info;
+}
+
+int32_t YOLOLayerNode::num_classes() const
+{
+    return _num_classes;
+}
+
+bool YOLOLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor YOLOLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    return src->desc();
+}
+
+NodeType YOLOLayerNode::type() const
+{
+    return NodeType::YOLOLayer;
+}
+
+void YOLOLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 29505e5..e09451c 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        *handle.first = _blobs[handle.second];
+        handle.first->set_region(_blobs[handle.second].get());
     }
 }
 
@@ -61,7 +61,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        *handle.first = nullptr;
+        handle.first->set_region(nullptr);
     }
 }
 
@@ -82,17 +82,11 @@
 
     for(const auto &size : sizes)
     {
-        _blobs.push_back(_allocator->allocate(size, 0));
+        _blobs.push_back(_allocator->make_region(size, 0));
     }
 }
 
 void BlobMemoryPool::free_blobs()
 {
-    ARM_COMPUTE_ERROR_ON(!_allocator);
-
-    for(auto &blob : _blobs)
-    {
-        _allocator->free(blob);
-    }
     _blobs.clear();
 }
\ No newline at end of file
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index bbc513d..5bea85c 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,23 +24,20 @@
 #include "arm_compute/runtime/CL/CLMemory.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 
 namespace arm_compute
 {
 CLMemory::CLMemory()
     : _region(nullptr), _region_owned(nullptr)
 {
-    create_empty_region();
 }
 
 CLMemory::CLMemory(std::shared_ptr<ICLMemoryRegion> memory)
     : _region(nullptr), _region_owned(std::move(memory))
 {
-    if(_region_owned == nullptr)
-    {
-        create_empty_region();
-    }
-    _region = _region_owned.get();

+    // _region_owned already owns the region (moved in the initializer list above)
+    _region       = _region_owned.get();
 }
 
 CLMemory::CLMemory(ICLMemoryRegion *memory)
@@ -49,19 +46,36 @@
     _region = memory;
 }
 
-ICLMemoryRegion *CLMemory::region()
+ICLMemoryRegion *CLMemory::cl_region()
 {
     return _region;
 }
 
-ICLMemoryRegion *CLMemory::region() const
+ICLMemoryRegion *CLMemory::cl_region() const
 {
     return _region;
 }
 
-void CLMemory::create_empty_region()
+IMemoryRegion *CLMemory::region()
 {
-    _region_owned = std::make_shared<CLBufferMemoryRegion>(cl::Context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0);
+    return _region;
+}
+
+IMemoryRegion *CLMemory::region() const
+{
+    return _region;
+}
+
+void CLMemory::set_region(IMemoryRegion *region)
+{
+    auto cl_region = utils::cast::polymorphic_downcast<ICLMemoryRegion *>(region);
+    _region_owned  = nullptr;
+    _region        = cl_region;
+}
+
+void CLMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+    _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
     _region       = _region_owned.get();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 15fd7f3..9578d73 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -48,9 +48,10 @@
     return _mapping;
 }
 
-void **ICLMemoryRegion::handle()
+std::unique_ptr<IMemoryRegion> ICLMemoryRegion::extract_subregion(size_t offset, size_t size)
 {
-    return reinterpret_cast<void **>(&_mem);
+    ARM_COMPUTE_UNUSED(offset, size);
+    return nullptr;
 }
 
 CLBufferMemoryRegion::CLBufferMemoryRegion(cl::Context ctx, cl_mem_flags flags, size_t size)
@@ -62,6 +63,12 @@
     }
 }
 
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
+    : ICLMemoryRegion(buffer.getInfo<CL_MEM_CONTEXT>(), buffer.getInfo<CL_MEM_SIZE>())
+{
+    _mem = buffer;
+}
+
 void *CLBufferMemoryRegion::ptr()
 {
     return nullptr;
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index dd716f7..0307498 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -28,86 +28,87 @@
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+const cl::Buffer CLTensorAllocator::_empty_buffer = cl::Buffer();
 
 namespace
 {
-std::shared_ptr<arm_compute::ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
+std::unique_ptr<ICLMemoryRegion> allocate_region(cl::Context context, size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::shared_ptr<ICLMemoryRegion> region = std::make_shared<CLFineSVMMemoryRegion>(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
+    std::unique_ptr<ICLMemoryRegion> region = support::cpp14::make_unique<CLFineSVMMemoryRegion>(context,
+                                                                                                 CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+                                                                                                 size,
+                                                                                                 alignment);
 
     // Try coarse-grain SVM in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = std::make_shared<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
+        region = support::cpp14::make_unique<CLCoarseSVMMemoryRegion>(context, CL_MEM_READ_WRITE, size, alignment);
     }
     // Try legacy buffer memory in case of failure
     if(region != nullptr && region->ptr() == nullptr)
     {
-        region = std::make_shared<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
+        region = support::cpp14::make_unique<CLBufferMemoryRegion>(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }
     return region;
 }
 } // namespace
 
 CLTensorAllocator::CLTensorAllocator(CLTensor *owner)
-    : _associated_memory_group(nullptr), _memory(), _owner(owner)
+    : _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _owner(owner)
 {
 }
 
 uint8_t *CLTensorAllocator::data()
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+    return _mapping;
 }
 
 const cl::Buffer &CLTensorAllocator::cl_data() const
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    return _memory.region()->cl_data();
+    return _memory.region() == nullptr ? _empty_buffer : _memory.cl_region()->cl_data();
 }
 
 void CLTensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-
     if(_associated_memory_group == nullptr)
     {
-        if(_memory.region()->cl_data().get() != nullptr)
+        if(_memory.region() != nullptr && _memory.cl_region()->cl_data().get() != nullptr)
         {
             // Memory is already allocated. Reuse it if big enough, otherwise fire an assertion
-            ARM_COMPUTE_ERROR_ON_MSG(info().total_size() > _memory.region()->size(), "Reallocation of a bigger memory region is not allowed!");
+            ARM_COMPUTE_ERROR_ON_MSG(info().total_size() > _memory.region()->size(),
+                                     "Reallocation of a bigger memory region is not allowed!");
         }
         else
         {
             // Perform memory allocation
-            _memory = CLMemory(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
+            _memory.set_owned_region(allocate_region(CLScheduler::get().context(), info().total_size(), 0));
         }
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, _memory.region()->handle(), info().total_size());
-        _memory.region()->set_size(info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
     }
     info().set_is_resizable(false);
 }
 
 void CLTensorAllocator::free()
 {
-    if(_associated_memory_group == nullptr)
-    {
-        _memory = CLMemory();
-        info().set_is_resizable(true);
-    }
+    _mapping = nullptr;
+    _memory.set_region(nullptr);
+    info().set_is_resizable(true);
 }
 
-arm_compute::Status CLTensorAllocator::import_memory(CLMemory memory)
+arm_compute::Status CLTensorAllocator::import_memory(cl::Buffer buffer)
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->cl_data().get() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_SIZE>() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(buffer.getInfo<CL_MEM_CONTEXT>().get() != CLScheduler::get().context().get());
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
-    _memory = memory;
+
+    _memory.set_owned_region(support::cpp14::make_unique<CLBufferMemoryRegion>(buffer));
     info().set_is_resizable(false);
 
     return Status{};
@@ -115,11 +116,10 @@
 
 void CLTensorAllocator::set_associated_memory_group(CLMemoryGroup *associated_memory_group)
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_memory.region()->cl_data().get() != nullptr);
-    _memory                  = CLMemory(std::make_shared<CLBufferMemoryRegion>(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, 0));
+    ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.cl_region()->cl_data().get() != nullptr);
+
     _associated_memory_group = associated_memory_group;
 }
 
@@ -136,16 +136,23 @@
 
 uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking)
 {
+    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
     ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
-    _memory.region()->map(q, blocking);
-    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+
+    _mapping = reinterpret_cast<uint8_t *>(_memory.cl_region()->map(q, blocking));
+    return _mapping;
 }
 
 void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping)
 {
-    ARM_COMPUTE_UNUSED(mapping);
+    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+    ARM_COMPUTE_ERROR_ON(_mapping != mapping);
     ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() == nullptr);
-    _memory.region()->unmap(q);
+    ARM_COMPUTE_UNUSED(mapping);
+
+    _memory.cl_region()->unmap(q);
+    _mapping = nullptr;
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
index 5fca30c..e661f6a 100644
--- a/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
+++ b/src/runtime/CL/functions/CLArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
 #include "support/ToolchainSupport.h"
 
@@ -30,11 +31,21 @@
 
 using namespace arm_compute;
 
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticSubtractionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
 
 Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
new file mode 100644
index 0000000..7919b13
--- /dev/null
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLBatchToSpaceLayer::CLBatchToSpaceLayer()
+    : _batch_to_space_kernel()
+{
+}
+
+void CLBatchToSpaceLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+{
+    _batch_to_space_kernel.configure(input, block_shape, output);
+}
+
+void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output)
+{
+    _batch_to_space_kernel.configure(input, block_shape_x, block_shape_y, output);
+}
+
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+{
+    return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
+}
+
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+{
+    return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+}
+
+void CLBatchToSpaceLayer::run()
+{
+    CLScheduler::get().enqueue(_batch_to_space_kernel, true);
+}
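A hedged usage sketch of the new CLBatchToSpaceLayer function, assuming input and output are already-allocated CLTensors; the block shape values are hypothetical.

    CLBatchToSpaceLayer batch_to_space;
    batch_to_space.configure(&input, 2 /* block_shape_x */, 2 /* block_shape_y */, &output);
    batch_to_space.run(); // enqueues the CLBatchToSpaceLayerKernel configured above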
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
new file mode 100644
index 0000000..46a6b8e
--- /dev/null
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
+
+#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+{
+    // Configure Bounding Box kernel
+    auto k = arm_compute::support::cpp14::make_unique<CLBoundingBoxTransformKernel>();
+    k->configure(boxes, pred_boxes, deltas, info);
+    _kernel = std::move(k);
+}
+
+Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+{
+    return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
+}
+} // namespace arm_compute
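A hedged usage sketch of CLBoundingBoxTransform, assuming boxes, pred_boxes and deltas are initialised CLTensors and bbox_info is a BoundingBoxTransformInfo; run() comes from the simple-function base class.

    CLBoundingBoxTransform transform;
    ARM_COMPUTE_ERROR_THROW_ON(CLBoundingBoxTransform::validate(boxes.info(), pred_boxes.info(), deltas.info(), bbox_info));
    transform.configure(&boxes, &pred_boxes, &deltas, bbox_info);
    transform.run();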
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
new file mode 100644
index 0000000..409d3c9
--- /dev/null
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLComputeAllAnchors::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+{
+    // Configure ComputeAllAnchors kernel
+    auto k = arm_compute::support::cpp14::make_unique<CLComputeAllAnchorsKernel>();
+    k->configure(anchors, all_anchors, info);
+    _kernel = std::move(k);
+}
+
+Status CLComputeAllAnchors::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+{
+    return CLComputeAllAnchorsKernel::validate(anchors, all_anchors, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 40562b5..e07feb2 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -27,6 +27,8 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CPP/CPPScheduler.h"
 
 #include <memory>
 #include <tuple>
@@ -38,7 +40,10 @@
     : _memory_group(std::move(memory_manager)),
       _scale_f(),
       _conv_f(),
+      _flip_weights(),
       _scaled_output(),
+      _original_weights(nullptr),
+      _weights_flipped(),
       _is_prepared(false)
 {
 }
@@ -47,9 +52,17 @@
                                       unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+
+    const DataLayout data_layout = input->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(!info.padding_is_symmetric());
 
     const unsigned int stride_x = info.stride().first;
@@ -58,24 +71,34 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
-                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
 
-    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
 
     if(bias != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        if(is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w], "Output's width is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h], "Output's height is invalid.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c], "Output's depth is invalid.");
 
-    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
-                                                                                                      info)));
+    unsigned int        padx            = 0;
+    unsigned int        pady            = 0;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
@@ -84,7 +107,7 @@
     return Status{};
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
                                      unsigned int inner_border_right, unsigned int inner_border_top, const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -92,36 +115,46 @@
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
-                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+    const DataLayout data_layout = input->info()->data_layout();
 
-    const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    _original_weights = weights;
+    _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+    _flip_weights.configure(weights, &_weights_flipped);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
-    _is_prepared = false;
+    _is_prepared = weights_info.retain_internal_weights();
 
     _memory_group.manage(&_scaled_output);
 
-    // configure scale function
-    // Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
-    TensorShape        scale_out_shape(input->info()->tensor_shape());
-    const unsigned int out_x = input->info()->dimension(0) + (input->info()->dimension(0) - 1) * (stride_x - 1) + inner_border_right + 2 * info.pad().first;
-    const unsigned int out_y = input->info()->dimension(1) + (input->info()->dimension(1) - 1) * (stride_y - 1) + inner_border_top + 2 * info.pad().second;
-    scale_out_shape.set(0, out_x);
-    scale_out_shape.set(1, out_y);
-    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type());
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+    unsigned int      padx            = 0;
+    unsigned int      pady            = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    scale_out_info.set_data_layout(data_layout);
     _scaled_output.allocator()->init(scale_out_info);
 
-    _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), info);
+    // configure scale function
+    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+    _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), upsample_info);
 
     // setup the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    _conv_f.configure(&_scaled_output, weights, bias, output, conv_info, weights_info);
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
     _scaled_output.allocator()->allocate();
 }
 
@@ -141,7 +174,25 @@
 {
     if(!_is_prepared)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
+        _weights_flipped.allocator()->allocate();
+        _weights_flipped.map(true);
+        _original_weights->map(CLScheduler::get().queue(), true);
+        CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
+        _weights_flipped.unmap();
+        _original_weights->unmap(CLScheduler::get().queue());
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
         _conv_f.prepare();
+
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
         _is_prepared = true;
     }
 }
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 13a24f8..ce8667d 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,15 @@
 void CLDeconvolutionLayerUpsample::run()
 {
     _output->map(CLScheduler::get().queue(), true);
-    memset(_output->buffer(), 0, _output->info()->total_size());
+    if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
+    {
+        const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+        std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+    }
+    else
+    {
+        memset(_output->buffer(), 0, _output->info()->total_size());
+    }
     _output->unmap(CLScheduler::get().queue());
 
     CLScheduler::get().enqueue(_upsample, false);
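Why the new branch above is needed: in QASYMM8 the real value 0 is represented by the quantization offset, so zeroing the raw buffer would inject non-zero values into the upsampled output. A hypothetical example:

    // With QuantizationInfo(scale = 0.1f, offset = 128), real 0.0f quantizes to 128,
    // so the buffer is filled with 128 (the offset) rather than raw 0.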
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 76451af..497cdae 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -90,12 +90,13 @@
 }
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
-    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
-      _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
+    : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _activationlayer_function(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(),
+      _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
 {
 }
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
@@ -188,10 +189,18 @@
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
@@ -238,6 +247,12 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
     }
 
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
@@ -253,6 +268,10 @@
     {
         CLScheduler::get().enqueue(_output_stage_kernel);
     }
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
 
 void CLDepthwiseConvolutionLayer::prepare()
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index 45f70d2..a0663b7 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -68,7 +68,7 @@
     }
     else
     {
-        const float diff = image_size - 1;
+        const float diff = image_size - num_lowest_pixels;
 
         for(size_t i = 0; i < 256; ++i)
         {
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 364db34..4137071 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,17 @@
 #include "arm_compute/core/CL/kernels/CLFloorKernel.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLFloor::configure(const ICLTensor *input, ICLTensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLFloorKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status CLFloor::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLFloorKernel::validate(input, output);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 010985d..6a2aac6 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -49,6 +49,7 @@
         // Validate gemmlowp function
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
                                                                            &weights.clone()->set_quantization_info(weights_quantization_info),
+                                                                           nullptr,
                                                                            &output));
     }
     else
@@ -91,7 +92,7 @@
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
         // Configure gemmlowp function
-        _mm_gemmlowp.configure(input, weights, output);
+        _mm_gemmlowp.configure(input, weights, nullptr, output);
 
+        // Revert back QuantizationInfo as input and weights could be used in other fully connected layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -100,7 +101,7 @@
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 1, false, retain_internal_weights));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
     }
 }
 
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
new file mode 100644
index 0000000..32e4678
--- /dev/null
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFuseBatchNormalization::CLFuseBatchNormalization()
+    : _fuse_bn_kernel()
+{
+}
+
+void CLFuseBatchNormalization::configure(const ICLTensor *conv_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
+                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
+                                         const ICLTensor *conv_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
+                                         float epsilon)
+{
+    _fuse_bn_kernel.configure(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+Status CLFuseBatchNormalization::validate(const ITensorInfo *conv_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
+                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
+                                          const ITensorInfo *conv_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
+                                          float epsilon)
+{
+    return CLFuseBatchNormalizationKernel::validate(conv_weights, bn_mean, bn_var, fused_weights, fused_bias, conv_bias, bn_beta, bn_gamma, epsilon);
+}
+
+void CLFuseBatchNormalization::run()
+{
+    CLScheduler::get().enqueue(_fuse_bn_kernel, true);
+}
+} // namespace arm_compute
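For reference, the standard batch-normalisation folding identities that the fused kernel is expected to apply per output channel (a sketch only; the authoritative implementation is CLFuseBatchNormalizationKernel):

    // fused_weights[c] = conv_weights[c] * bn_gamma[c] / sqrt(bn_var[c] + epsilon)
    // fused_bias[c]    = (conv_bias[c] - bn_mean[c]) * bn_gamma[c] / sqrt(bn_var[c] + epsilon) + bn_beta[c]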
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f16d1c0..baa0cf4 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -44,8 +44,9 @@
 {
     bool flag = true;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
+        // COMPMID-852
         if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
         {
             constexpr float alpha = 3.2f;
@@ -71,8 +72,18 @@
 } // namespace
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
-      _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+    : _memory_group(std::move(memory_manager)),
+      _interleave_kernel(),
+      _transpose_kernel(),
+      _mm_kernel(),
+      _ma_kernel(),
+      _tmp_a(),
+      _tmp_b(),
+      _original_b(nullptr),
+      _is_interleaved_transposed(false),
+      _run_addition(false),
+      _reshape_b_only_on_first_run(false),
+      _is_prepared(false)
 {
 }
 
@@ -122,10 +133,7 @@
     if(_is_interleaved_transposed)
     {
         reinterpret_input_as_3d = false;
-    }
 
-    if(_is_interleaved_transposed)
-    {
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
@@ -145,8 +153,10 @@
     }
 
     // Configure and tune matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d,
-                                                                                                        reinterpret_input_as_3d));
+    _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+                                                                                                        mult_transpose1xW_width, mult_interleave4x4_height,
+                                                                                                        depth_output_gemm3d, reinterpret_input_as_3d),
+                         gemm_info.fp_mixed_precision());
     CLScheduler::get().tune_kernel_static(_mm_kernel);
 
     if(_is_interleaved_transposed)
@@ -227,7 +237,7 @@
     }
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, alpha, run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
 
     if(beta != 0 && c != nullptr)
     {
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 92d04d6..4694aa7 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,16 +91,21 @@
 }
 
 CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
-      _skip_im2col(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+    : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _col2im_kernel(), _activationlayer_function(), _add_bias_kernel(),
+      _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false), _skip_col2im(false), _is_quantized(false),
+      _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
 
-void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, int gemm_3d_depth)
+void CLGEMMConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                          int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), gemmlowp_output_stage, gemm_3d_depth, _skip_im2col));
+
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                         false, gemmlowp_output_stage);
 
     if(_is_quantized)
     {
@@ -112,7 +117,7 @@
         input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
-        _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+        _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
 
         // Revert back QuantizationInfo as input and weights could be used in other convolution layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -121,16 +126,19 @@
     else
     {
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
-                                                                                 _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
     }
 }
 
-Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth, bool skip_im2col)
+Status CLGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+                                           const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+                                         false, gemmlowp_output_stage);
+
     if(is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -144,7 +152,7 @@
         weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
         // Perform validation step on GEMMLowp
-        return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+        return CLGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, gemm_info);
     }
     else
     {
@@ -177,21 +185,21 @@
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
 
-    _is_prepared      = weights_info.retain_internal_weights();
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _data_layout      = data_layout;
-    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !_is_quantized;
-    _append_bias      = (biases != nullptr) && (!_is_quantized);
+    _is_prepared                = weights_info.retain_internal_weights();
+    _original_weights           = weights;
+    _is_quantized               = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _data_layout                = data_layout;
+    _skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    _skip_col2im                = data_layout == DataLayout::NHWC;
+    _append_bias                = (biases != nullptr) && (!_is_quantized);
+    _is_activationlayer_enabled = act_info.enabled();
 
     // Set the GPU target for im2col and col2im
     _im2col_kernel.set_target(CLScheduler::get().target());
     _col2im_kernel.set_target(CLScheduler::get().target());
 
-    bool             is_nhwc                   = _data_layout == DataLayout::NHWC;
-    const ICLTensor *gemm_input_to_use         = input;
-    ICLTensor       *gemm_output_to_use        = output;
-    ICLTensor       *gemm_output_staged_to_use = output;
+    const ICLTensor *gemm_input_to_use  = input;
+    ICLTensor       *gemm_output_to_use = output;
 
     const ICLTensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
 
@@ -238,17 +246,18 @@
     }
 
     // Create GEMM output tensor
-    if(!is_nhwc || _is_quantized)
+    if(!_skip_col2im)
     {
-        // Calculate GEMM output shape
-        TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
+        TensorShape shape_gemm;
+
+        // If we cannot skip col2im it means we run im2col as well
+        shape_gemm = _im2col_output.info()->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
 
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-        const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
-        TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->info()->quantization_info());
+        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+        TensorInfo info_gemm(shape_gemm, 1, data_type);
+        info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
         _memory_group.manage(&_gemm_output);
 
@@ -256,56 +265,76 @@
         gemm_output_to_use = &_gemm_output;
     }
 
-    // Configure and tune GEMM
-    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1);
-
-    if(!_skip_im2col)
-    {
-        _im2col_output.allocator()->allocate();
-    }
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset     = 0;
+    gemmlowp_output_stage.gemmlowp_multiplier = 0;
+    gemmlowp_output_stage.gemmlowp_shift      = 0;
 
     // Configure output stage for quantized case
     if(_is_quantized)
     {
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        const float multiplier  = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        _memory_group.manage(&_tmp_output);
-        gemm_output_staged_to_use = &_tmp_output;
+        int min_activation = 0;
+        int max_activation = 0;
 
-        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+
+        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
+            _is_activationlayer_enabled = false;
+        }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
     }
 
-    if(!is_nhwc || _is_quantized)
+    // Configure and tune GEMM
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth);
+
+    if(!_skip_im2col)
     {
-        if(input->info()->data_layout() == DataLayout::NCHW)
-        {
-            // Configure and tune Col2Im
-            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h), num_groups);
-            CLScheduler::get().tune_kernel_static(_col2im_kernel);
-        }
-        else
-        {
-            // Configure reshape layer
-            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
-        }
+        _im2col_output.allocator()->allocate();
     }
 
-    if(!is_nhwc || _is_quantized)
+    if(!_skip_col2im)
     {
-        _tmp_output.allocator()->allocate();
+        // Configure and tune Col2Im
+        _col2im_kernel.configure(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
+        CLScheduler::get().tune_kernel_static(_col2im_kernel);
+    }
+
+    if(!_skip_col2im)
+    {
         _gemm_output.allocator()->allocate();
     }
 
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
@@ -336,16 +365,16 @@
     const unsigned int kernel_width  = weights->dimension(idx_width);
     const unsigned int kernel_height = weights->dimension(idx_height);
 
-    TensorInfo         im2col_reshaped_info, info_gemm, tmp_info, weights_reshaped_info;
-    const ITensorInfo *gemm_input_to_use         = input;
-    const ITensorInfo *gemm_output_to_use        = output;
-    const ITensorInfo *gemm_output_staged_to_use = output;
-    const ITensorInfo *weights_to_use            = weights;
+    TensorInfo         im2col_reshaped_info, info_gemm, weights_reshaped_info;
+    const ITensorInfo *gemm_input_to_use  = input;
+    const ITensorInfo *gemm_output_to_use = output;
+    const ITensorInfo *weights_to_use     = weights;
 
-    const bool is_nhwc      = data_layout == DataLayout::NHWC;
-    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
-    const bool skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1) && !is_quantized;
-    const bool append_bias  = (biases != nullptr) && (!is_quantized);
+    const bool is_quantized               = is_data_type_quantized_asymmetric(data_type);
+    const bool append_bias                = (biases != nullptr) && (!is_quantized);
+    const bool skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    const bool skip_col2im                = data_layout == DataLayout::NHWC;
+    bool       is_activationlayer_enabled = act_info.enabled();
 
     ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * num_groups) != input->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
@@ -407,47 +436,76 @@
     }
 
     // Create GEMM output tensor
-    if(!is_nhwc || is_quantized)
+    if(!skip_col2im)
     {
-        TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+        TensorShape shape_gemm;
+
+        shape_gemm = gemm_input_to_use->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
-        const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
-        info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->quantization_info());
+
+        info_gemm = TensorInfo(shape_gemm, 1, data_type);
+        info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
         gemm_output_to_use = &info_gemm;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, (data_layout == DataLayout::NHWC) ? conv_h : 1, skip_im2col));
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset     = 0;
+    gemmlowp_output_stage.gemmlowp_multiplier = 0;
+    gemmlowp_output_stage.gemmlowp_shift      = 0;
 
     if(is_quantized)
     {
-        float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
 
-        tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
-        tmp_info.set_quantization_info(output->quantization_info());
-        gemm_output_staged_to_use = &tmp_info;
+        const float multiplier  = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
 
-        // Validate output stage for quantized case
-        CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+
+        if(is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
+            is_activationlayer_enabled = false;
+        }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
     }
 
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
+
     // Validate Col2Im
-    if(!is_nhwc || is_quantized)
+    if(!skip_col2im)
     {
-        if(input->data_layout() == DataLayout::NCHW)
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
-                                                                 output,
-                                                                 std::make_pair(conv_w, conv_h), num_groups));
-        }
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups));
     }
 
     //Validate Activation Layer
-    if(act_info.enabled())
+    if(is_activationlayer_enabled)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
     }
@@ -472,9 +530,6 @@
     {
         // Run gemmlowp
         _mm_gemmlowp.run();
-
-        // Run output stage
-        _gemmlowp_output_stage.run();
     }
     else
     {
@@ -488,16 +543,9 @@
     }
 
     // Reshape output matrix
-    if(_data_layout == DataLayout::NCHW || _is_quantized)
+    if(!_skip_col2im)
     {
-        if(_data_layout == DataLayout::NCHW)
-        {
-            CLScheduler::get().enqueue(_col2im_kernel, false);
-        }
-        else
-        {
-            _reshape_layer.run();
-        }
+        CLScheduler::get().enqueue(_col2im_kernel, false);
     }
 
     //Run Activation Layer if enabled
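
As a worked example of the quantized activation clamping introduced above (numbers are illustrative, not taken from the patch): with an output quantization of scale = 0.05 and offset = 10, a BOUNDED_RELU activation with a = 6.0f quantizes to a_const_int = round(6.0 / 0.05) + 10 = 130. The activation is not LU_BOUNDED_RELU, so the lower bound is the zero point (10), and it is not plain RELU, so the upper bound is a_const_int (130). The GEMMLowp output stage is therefore configured with gemmlowp_min_bound = 10 and gemmlowp_max_bound = 130, and the separate CLActivationLayer is skipped.
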
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 0ce07c3..2d4d231 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -41,8 +41,11 @@
 {
     bool flag = true;
 
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target,
+                        GPUTarget::G71, GPUTarget::G72,
+                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT))
     {
+        // COMPMID-852
         if(k > 256 && m > 4 && reshape_b_only_on_first_run)
         {
             flag = ((0.72f + n * 0.10766f) < (n * 0.1284f));
@@ -52,6 +55,10 @@
             flag = false;
         }
     }
+    else
+    {
+        flag = m > 1;
+    }
 
     return flag;
 }
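
In other words, on the GPU targets listed above the reshaped (interleaved/transposed) path is chosen only when k > 256, m > 4, reshape_b_only_on_first_run is set and 0.72 + 0.10766 * n < 0.1284 * n, i.e. when n > 0.72 / (0.1284 - 0.10766), roughly n >= 35 columns; on any other target it simply requires m > 1.
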
@@ -65,24 +72,26 @@
       _mtx_a_reduction_kernel(),
       _mtx_b_reduction_kernel(),
       _offset_contribution_kernel(),
+      _offset_contribution_output_stage_kernel(),
       _vector_sum_col(),
       _vector_sum_row(),
       _tmp_a(),
       _tmp_b(),
+      _mm_result_s32(),
       _original_b(nullptr),
       _a_offset(0),
       _b_offset(0),
       _is_interleaved_transposed(true),
       _reshape_b_only_on_first_run(false),
-      _is_prepared(false)
+      _is_prepared(false),
+      _fuse_output_stage(false)
 {
 }
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_UNUSED(gemm_info);
-    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
 
     _is_prepared                 = false;
     _original_b                  = b;
@@ -103,9 +112,12 @@
     // Arguments used by GEMMReshapeInfo
     // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
     // in order to know how the matrices have been reshaped
-    const int     m                         = a->info()->dimension(1);
+    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const bool    unroll_block              = dot8_supported(CLKernelLibrary::get().get_device());
+    const int     m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const int     n                         = b->info()->dimension(0);
     const int     k                         = a->info()->dimension(0);
+    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
     constexpr int mult_transpose1xW_width   = 1;
     constexpr int mult_interleave4x4_height = 1;
 
@@ -114,6 +126,9 @@
 
     if(_is_interleaved_transposed)
     {
+        // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
+        reinterpret_input_as_3d = false;
+
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
@@ -124,15 +139,12 @@
         }
 
         // Configure interleave kernel
-        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
+        _mtx_a_reshape_kernel.configure(a, &_tmp_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d(), unroll_block);
 
         // Configure transpose kernel
         _mtx_b_reshape_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
     }
 
-    // Configure matrix multiply kernel
-    _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
-
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
     if(_a_offset != 0)
     {
@@ -158,8 +170,34 @@
         _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
     }
 
-    // Configure offset contribution kernel
-    _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        _fuse_output_stage = true;
+
+        _memory_group.manage(&_mm_result_s32);
+
+        // Configure matrix multiply kernel
+        _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+                                                                                                              mult_transpose1xW_width, mult_interleave4x4_height,
+                                                                                                              depth_output_gemm3d, reinterpret_input_as_3d));
+
+        // Configure offset contribution kernel
+        _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
+                                                           _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+
+        _mm_result_s32.allocator()->allocate();
+    }
+    else
+    {
+        // Configure matrix multiply kernel
+        _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k,
+                                                                                                     mult_transpose1xW_width, mult_interleave4x4_height,
+                                                                                                     depth_output_gemm3d, reinterpret_input_as_3d));
+
+        // Configure offset contribution kernel
+        _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, a->info()->dimension(0), _a_offset, _b_offset);
+    }
 
     // Allocate tensors
     if(_is_interleaved_transposed)
@@ -182,45 +220,52 @@
     }
 }
 
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
-                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
-                                    "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
-                                    "The output matrix must have the same number of columns as the matrix B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
     int32_t a_offset = a->quantization_info().offset;
     int32_t b_offset = b->quantization_info().offset;
 
-    const int             m                         = a->dimension(1);
-    const int             n                         = b->dimension(0);
-    const int             k                         = a->dimension(0);
-    constexpr int         mult_transpose1xW_width   = 1;
-    constexpr int         mult_interleave4x4_height = 1;
-    const int             depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
-    const GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d);
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    bool          reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
+    const int     m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const int     n                         = b->dimension(0);
+    const int     k                         = a->dimension(0);
+    constexpr int mult_transpose1xW_width   = 1;
+    constexpr int mult_interleave4x4_height = 1;
+    const int     depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
 
     bool reshape_matrices = is_interleaved_transposed(m, n, k, gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
+    // if reshape_matrices is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
     if(reshape_matrices)
     {
-        TensorInfo info_a(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()), 1, a->data_type());
-        TensorInfo info_b(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width), 1, b->data_type());
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, mult_transpose1xW_width));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output, reshape_matrices, reshape_info));
+        reinterpret_input_as_3d = false;
     }
-    else
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    if(reshape_matrices)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output, reshape_matrices, reshape_info));
+        matrix_a_info = &tmp_a_info;
+        matrix_b_info = &tmp_b_info;
+
+        // Validate interleave kernel
+        auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &tmp_a_info, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()));
+
+        // Validate transpose kernel
+        auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &tmp_b_info, mult_transpose1xW_width));
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
@@ -243,11 +288,37 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
     }
 
-    // Validate offset contribution kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
-                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                             a_offset, b_offset));
+    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        TensorInfo mm_result_s32_info{};
+
+        // Output tensor auto initialization if not yet initialized
+        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_matrices, reshape_info)).set_data_type(DataType::S32));
+
+        // Validate matrix multiply
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_matrices, reshape_info));
+
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
+                                                                                            a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                            b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                            c,
+                                                                                            output,
+                                                                                            a_offset, b_offset,
+                                                                                            gemm_info.gemmlowp_output_stage()));
+    }
+    else
+    {
+        // Validate matrix multiply
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_matrices, reshape_info));
+
+        // Validate offset contribution kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+                                                                                 a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                                 b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                                 c,
+                                                                                 a_offset, b_offset));
+    }
 
     return Status{};
 }
@@ -285,8 +356,16 @@
         CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
     }
 
-    // Run offset contribution kernel
-    CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+    if(_fuse_output_stage)
+    {
+        // Run offset contribution/output stage kernel
+        CLScheduler::get().enqueue(_offset_contribution_output_stage_kernel, true);
+    }
+    else
+    {
+        // Run offset contribution kernel
+        CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+    }
 
     _memory_group.release();
 }
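
Conceptually, the fused path above folds the offset contributions, the optional bias and the requantization to QASYMM8 into a single kernel, which is why CLGEMMConvolutionLayer no longer runs a separate output stage. A hedged scalar sketch of what is computed per output element is shown below; it is a simplified model with illustrative helper names, not the kernel code, and the fixed-point rescale is only approximated.

    #include <algorithm>
    #include <cstdint>

    // acc     : raw S32 accumulator from the lowp matrix multiply
    // sum_col : reduction of the corresponding matrix B column (vector_sum_col)
    // sum_row : reduction of the corresponding matrix A row (vector_sum_row)
    // k       : number of accumulated products, i.e. a->dimension(0)
    int32_t offset_contribution(int32_t acc, int32_t sum_col, int32_t sum_row,
                                int32_t a_offset, int32_t b_offset, int32_t k, int32_t bias)
    {
        return acc + a_offset * sum_col + b_offset * sum_row + a_offset * b_offset * k + bias;
    }

    // Rough model of the QUANTIZE_DOWN_FIXEDPOINT stage: the real kernel uses saturating
    // rounding fixed-point arithmetic, approximated here by a plain shift.
    uint8_t quantize_down(int32_t value, int32_t multiplier, int32_t shift, int32_t offset,
                          int32_t min_bound, int32_t max_bound)
    {
        const int64_t scaled = (static_cast<int64_t>(value) * multiplier) >> (31 + shift);
        const int32_t out    = std::max(min_bound, std::min(max_bound, static_cast<int32_t>(scaled) + offset));
        return static_cast<uint8_t>(out);
    }
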
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 16d8678..f1282cb 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,11 +25,12 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
@@ -42,15 +43,33 @@
     return CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
 }
 
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
-                                                                    int result_offset_after_shift, int min, int max)
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                                    int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
+                                                                    int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
     k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
     _kernel = std::move(k);
 }
 
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                                                                     int min, int max)
 {
     return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
\ No newline at end of file
+}
+
+void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
+                                                               float multiplier, int offset,
+                                                               int min, int max)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel>();
+    k->configure(input, bias, output, multiplier, offset, min, max);
+    _kernel = std::move(k);
+}
+
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                                                                int min, int max)
+{
+    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
+}
+} // namespace arm_compute
\ No newline at end of file
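
The new ScaleByFloat variant requantizes the S32 accumulators with a single real-valued multiplier instead of a fixed-point multiplier/shift pair. Per element it behaves roughly as in the sketch below (a hedged scalar model with assumed round-to-nearest behaviour, not the kernel code):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Approximate per-element behaviour of the ScaleByFloat output stage
    uint8_t scale_by_float(int32_t acc, int32_t bias, float multiplier, int offset, int min, int max)
    {
        int out = static_cast<int>(std::lround((acc + bias) * multiplier)) + offset;
        out     = std::max(min, std::min(max, out));
        return static_cast<uint8_t>(out);
    }
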
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
new file mode 100644
index 0000000..5dd1202
--- /dev/null
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _permute_deltas_kernel(),
+      _flatten_deltas_kernel(),
+      _permute_scores_kernel(),
+      _flatten_scores_kernel(),
+      _compute_anchors_kernel(),
+      _bounding_box_kernel(),
+      _memset_kernel(),
+      _padded_copy_kernel(),
+      _cpp_nms_kernel(),
+      _deltas_permuted(),
+      _deltas_flattened(),
+      _scores_permuted(),
+      _scores_flattened(),
+      _all_anchors(),
+      _all_proposals(),
+      _keeps_nms_unused(),
+      _classes_nms_unused(),
+      _proposals_4_roi_values(),
+      _num_valid_proposals(nullptr),
+      _scores_out(nullptr)
+{
+}
+
+void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+                                         const GenerateProposalsInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+    ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+
+    const DataType data_type         = deltas->info()->data_type();
+    const int      num_anchors       = scores->info()->dimension(2);
+    const int      feat_width        = scores->info()->dimension(0);
+    const int      feat_height       = scores->info()->dimension(1);
+    const int      total_num_anchors = num_anchors * feat_width * feat_height;
+    const int      pre_nms_topN      = info.pre_nms_topN();
+    const int      post_nms_topN     = info.post_nms_topN();
+    const size_t   values_per_roi    = info.values_per_roi();
+
+    // Compute all the anchors
+    _memory_group.manage(&_all_anchors);
+    _compute_anchors_kernel.configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+
+    const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
+    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, data_type));
+
+    // Permute and reshape deltas
+    _memory_group.manage(&_deltas_permuted);
+    _memory_group.manage(&_deltas_flattened);
+    _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+    _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+    _deltas_permuted.allocator()->allocate();
+
+    const TensorShape flatten_shape_scores(1, total_num_anchors);
+    _scores_flattened.allocator()->init(TensorInfo(flatten_shape_scores, 1, data_type));
+
+    // Permute and reshape scores
+    _memory_group.manage(&_scores_permuted);
+    _memory_group.manage(&_scores_flattened);
+    _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+    _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+    _scores_permuted.allocator()->allocate();
+
+    // Bounding box transform
+    _memory_group.manage(&_all_proposals);
+    BoundingBoxTransformInfo bbox_info(info.im_width(), info.im_height(), 1.f);
+    _bounding_box_kernel.configure(&_all_anchors, &_all_proposals, &_deltas_flattened, bbox_info);
+    _deltas_flattened.allocator()->allocate();
+    _all_anchors.allocator()->allocate();
+
+    // The original layer implementation first selects the best pre_nms_topN anchors (thus having a lightweight sort)
+    // that are then transformed by bbox_transform. The boxes generated are then fed into a non-sorting NMS operation.
+    // Since we are reusing the NMS layer and we don't implement any sorting in CL, we let NMS do the sorting (of all the input)
+    // and the filtering.
+    const int   scores_nms_size = std::min<int>(std::min<int>(post_nms_topN, pre_nms_topN), total_num_anchors);
+    const float min_size_scaled = info.min_size() * info.im_scale();
+    _memory_group.manage(&_classes_nms_unused);
+    _memory_group.manage(&_keeps_nms_unused);
+
+    // Note that NMS needs outputs preinitialized.
+    auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, data_type);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, data_type);
+    auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
+
+    // Initialize temporary (unused) outputs
+    _classes_nms_unused.allocator()->init(TensorInfo(TensorShape(1, 1), 1, data_type));
+    _keeps_nms_unused.allocator()->init(*scores_out->info());
+
+    // Save the output (to map and unmap them at run)
+    _scores_out          = scores_out;
+    _num_valid_proposals = num_valid_proposals;
+
+    _memory_group.manage(&_proposals_4_roi_values);
+    _cpp_nms_kernel.configure(&_scores_flattened, &_all_proposals, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+                              BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+    _keeps_nms_unused.allocator()->allocate();
+    _classes_nms_unused.allocator()->allocate();
+    _all_proposals.allocator()->allocate();
+    _scores_flattened.allocator()->allocate();
+
+    // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
+    _padded_copy_kernel.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _proposals_4_roi_values.allocator()->allocate();
+
+    _memset_kernel.configure(proposals, PixelValue());
+}
+
+Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+                                          const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(scores, DataLayout::NCHW);
+
+    const int num_anchors       = scores->dimension(2);
+    const int feat_width        = scores->dimension(0);
+    const int feat_height       = scores->dimension(1);
+    const int num_images        = scores->dimension(3);
+    const int total_num_anchors = num_anchors * feat_width * feat_height;
+    const int values_per_roi    = info.values_per_roi();
+
+    ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
+
+    TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+    TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
+
+    TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+
+    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPermuteKernel::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+
+    TensorInfo scores_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, BoundingBoxTransformInfo(info.im_width(), info.im_height(),
+                                                                       1.f)));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(&proposals_4_roi_values, proposals, PaddingList{ { 0, 1 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(proposals, PixelValue()));
+
+    if(num_valid_proposals->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
+    }
+
+    if(proposals->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(proposals, deltas);
+    }
+
+    if(scores_out->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores_out, scores);
+    }
+
+    return Status{};
+}
+
+void CLGenerateProposalsLayer::run_cpp_nms_kernel()
+{
+    // Map inputs
+    _scores_flattened.map(true);
+    _all_proposals.map(true);
+
+    // Map outputs
+    _scores_out->map(CLScheduler::get().queue(), true);
+    _proposals_4_roi_values.map(CLScheduler::get().queue(), true);
+    _num_valid_proposals->map(CLScheduler::get().queue(), true);
+    _keeps_nms_unused.map(true);
+    _classes_nms_unused.map(true);
+
+    // Run NMS
+    CPPScheduler::get().schedule(&_cpp_nms_kernel, Window::DimX);
+
+    // Unmap outputs
+    _keeps_nms_unused.unmap();
+    _classes_nms_unused.unmap();
+    _scores_out->unmap(CLScheduler::get().queue());
+    _proposals_4_roi_values.unmap(CLScheduler::get().queue());
+    _num_valid_proposals->unmap(CLScheduler::get().queue());
+
+    // Unmap inputs
+    _scores_flattened.unmap();
+    _all_proposals.unmap();
+}
+
+void CLGenerateProposalsLayer::run()
+{
+    // Acquire all the temporaries
+    _memory_group.acquire();
+
+    // Compute all the anchors
+    CLScheduler::get().enqueue(_compute_anchors_kernel, false);
+
+    // Transpose and reshape the inputs
+    CLScheduler::get().enqueue(_permute_deltas_kernel, false);
+    CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
+    CLScheduler::get().enqueue(_permute_scores_kernel, false);
+    CLScheduler::get().enqueue(_flatten_scores_kernel, false);
+
+    // Build the boxes
+    CLScheduler::get().enqueue(_bounding_box_kernel, false);
+    // Non-maxima suppression
+    run_cpp_nms_kernel();
+    // Add dummy batch indexes
+    CLScheduler::get().enqueue(_memset_kernel, true);
+    CLScheduler::get().enqueue(_padded_copy_kernel, true);
+
+    // Release all the temporaries
+    _memory_group.release();
+}
+} // namespace arm_compute
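
A quick illustration of the sizing logic in configure() above (numbers chosen for illustration only): a 38x50 feature map with 15 anchors per location gives total_num_anchors = 38 * 50 * 15 = 28500; with pre_nms_topN = 6000 and post_nms_topN = 300, scores_nms_size = min(min(300, 6000), 28500) = 300, which is the shape used to auto-initialize scores_out and the per-ROI proposal values before NMS runs.
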
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 423faea..65ce7de 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -65,7 +65,7 @@
                                 float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
                                 BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
 {
-    ARM_COMPUTE_UNUSED(use_fp16);
+    ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-772): Add half float support
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index a3010a7..4f709d5 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -63,8 +63,8 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, axis, ReductionOperation::SUM_SQUARE));
 
-    // Reduce shape on axis (supported axis is 0)
-    shape.set(0, 1);
+    // Reduce shape on axis
+    shape.set(axis, 1);
     sum_sq.set_tensor_shape(shape);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLL2NormalizeLayerKernel::validate(input, &sum_sq, output, axis, epsilon));
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 3458135..a89c4e3 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -130,7 +130,6 @@
         _forget_gate_out3.allocator()->allocate();
     }
     _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    forget_gate_out->allocator()->allocate();
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -195,7 +194,6 @@
     _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
     _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _input_gate_out1.allocator()->allocate();
     _cell_state_out4.allocator()->allocate();
     _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     _forget_gate_out1.allocator()->allocate();
@@ -246,7 +244,6 @@
         _output1.allocator()->allocate();
     }
     _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
-    output_gate_out->allocator()->allocate();
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -281,12 +278,11 @@
 
     // Copy cell state and output
     _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
-    _cell_state_out1.allocator()->allocate();
     _copy_output.configure(output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<ICLTensor *> scratch_inputs;
-    if(lstm_params.has_cifg_opt())
+    if(!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(&_input_gate_out1);
     }
@@ -294,6 +290,10 @@
     scratch_inputs.emplace_back(forget_gate_out);
     scratch_inputs.emplace_back(output_gate_out);
     _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+    _input_gate_out1.allocator()->allocate();
+    _cell_state_out1.allocator()->allocate();
+    forget_gate_out->allocator()->allocate();
+    output_gate_out->allocator()->allocate();
 }
 
 Status CLLSTMLayer::validate(const ITensorInfo *input,
@@ -444,7 +444,7 @@
 
     // Validate scratch concatenation
     std::vector<ITensorInfo *> inputs_vector_info_raw;
-    if(lstm_params.has_cifg_opt())
+    if(!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
     }
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 40bf032..5c6bef9 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -122,7 +122,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
 
     return Status{};
 }
@@ -163,7 +163,7 @@
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _input_im2col_reshaped.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
new file mode 100644
index 0000000..11d70e3
--- /dev/null
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLNormalizePlanarYUVLayer::CLNormalizePlanarYUVLayer()
+    : _norm_kernel()
+{
+}
+
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+{
+    _norm_kernel.configure(input, output, mean, std);
+}
+
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ITensorInfo *mean, const ITensorInfo *std)
+{
+    return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
+}
+
+void CLNormalizePlanarYUVLayer::run()
+{
+    CLScheduler::get().enqueue(_norm_kernel, true);
+}
+} // namespace arm_compute
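A minimal usage sketch for the new CLNormalizePlanarYUVLayer function (shapes, data type and host-side initialization are illustrative assumptions, not part of the patch):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"

using namespace arm_compute;

void normalize_planar_yuv_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst, mean, std_dev;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
    mean.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
    std_dev.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));

    CLNormalizePlanarYUVLayer norm;
    norm.configure(&src, &dst, &mean, &std_dev); // per-channel normalization with mean/std

    src.allocator()->allocate();
    dst.allocator()->allocate();
    mean.allocator()->allocate();
    std_dev.allocator()->allocate();

    norm.run();
    CLScheduler::get().sync();
}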
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
new file mode 100644
index 0000000..de43c7d
--- /dev/null
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLPadLayer::CLPadLayer()
+    : _copy_kernel(), _fillborder_kernel(), _memset_kernel()
+{
+}
+
+void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding)
+{
+    // Copy the input to the output
+    _copy_kernel.configure(input, output, padding);
+
+    // Fill the whole output with zeros
+    _memset_kernel.configure(output, PixelValue());
+
+    // Fill padding on the first two dimensions with zeros
+    _fillborder_kernel.configure(input, input->info()->padding(), BorderMode::CONSTANT);
+}
+
+Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(input, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, padding));
+
+    return Status{};
+}
+
+void CLPadLayer::run()
+{
+    CLScheduler::get().enqueue(_memset_kernel, false);
+    CLScheduler::get().enqueue(_fillborder_kernel, false);
+    CLScheduler::get().enqueue(_copy_kernel, true);
+}
+} // namespace arm_compute
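A hedged usage sketch for the new CLPadLayer: pad a 3x3 F32 tensor by one element on each side of the first two dimensions. The shapes and the (before, after) reading of each PaddingList entry are assumptions for illustration only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"

using namespace arm_compute;

void pad_layer_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(3U, 3U), 1, DataType::F32));
    // Output is the input grown by the requested padding: (1 + 3 + 1) x (1 + 3 + 1)
    dst.allocator()->init(TensorInfo(TensorShape(5U, 5U), 1, DataType::F32));

    // One (before, after) pair per padded dimension (assumed interpretation)
    const PaddingList padding = { { 1, 1 }, { 1, 1 } };

    CLPadLayer pad;
    pad.configure(&src, &dst, padding);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    pad.run();
    CLScheduler::get().sync();
}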
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
new file mode 100644
index 0000000..4f6c969
--- /dev/null
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLPriorBoxLayer::CLPriorBoxLayer()
+    : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+{
+}
+
+void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+{
+    _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
+    _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
+    if(!info.max_sizes().empty())
+    {
+        _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+    }
+
+    auto k = arm_compute::support::cpp14::make_unique<CLPriorBoxLayerKernel>();
+    k->configure(input1, input2, output, info, &_min, &_max, &_aspect_ratios);
+    _kernel = std::move(k);
+}
+
+Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
+}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
new file mode 100644
index 0000000..5bfd594
--- /dev/null
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
+
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
+
+    return Status{};
+}
+
+void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+{
+    // Configure ROI pooling kernel
+    auto k = arm_compute::support::cpp14::make_unique<CLROIAlignLayerKernel>();
+    k->configure(input, rois, output, pool_info);
+    _kernel = std::move(k);
+}
+
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
new file mode 100644
index 0000000..1016ff7
--- /dev/null
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+{
+}
+void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _reduction_ops     = reduction_axis.num_dimensions();
+    _reduction_kernels = arm_compute::support::cpp14::make_unique<CLReductionOperation[]>(_reduction_ops);
+    _reduced_outs      = arm_compute::support::cpp14::make_unique<CLTensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims         = keep_dims;
+
+    // Perform reduction for every axis
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        out_shape.set(reduction_axis[i], 1);
+        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+        if(i == _reduction_ops - 1 && keep_dims)
+        {
+            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+        }
+        else
+        {
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type(), input->info()->quantization_info()));
+            _memory_group.manage(_reduced_outs.get() + i);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+        }
+    }
+
+    // Allocate intermediate tensors
+    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    {
+        _reduced_outs[i].allocator()->allocate();
+    }
+
+    // Configure reshape layer if we want to drop the dimensions
+    if(!keep_dims)
+    {
+        TensorShape out_shape = input->info()->tensor_shape();
+
+        // We have to sort the reduction axis vectors in order for remove_dimension
+        // to work properly
+        Coordinates axis_copy = reduction_axis;
+        std::sort(axis_copy.begin(), axis_copy.begin() + _reduction_ops);
+        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        {
+            out_shape.remove_dimension(axis_copy[i] - i);
+        }
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+    }
+}
+
+Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(keep_dims);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis[i] > 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+        if(output->total_size() > 0 && keep_dims)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+        }
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+    }
+
+    return Status{};
+}
+
+void CLReduceMean::run()
+{
+    _memory_group.acquire();
+
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        _reduction_kernels[i].run();
+    }
+
+    if(!_keep_dims)
+    {
+        _reshape.run();
+    }
+    _memory_group.release();
+}
+} // namespace arm_compute
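A short usage sketch for the new CLReduceMean function (shapes are illustrative): reduce over axes 0 and 1 of an 8x8x4 tensor, keeping the reduced dimensions as size 1.

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"

using namespace arm_compute;

void reduce_mean_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(1U, 1U, 4U), 1, DataType::F32));

    CLReduceMean reduce_mean;
    reduce_mean.configure(&src, Coordinates(0, 1), true /* keep_dims */, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    reduce_mean.run();
    CLScheduler::get().sync();
}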
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 2a171c3..c5447ff 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -37,8 +37,13 @@
 
 namespace
 {
-unsigned int calculate_number_of_stages(const ITensorInfo *input)
+unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int axis)
 {
+    // Only one stage is needed for every axis other than the x-axis, and also for the x-axis when the data type is quantized (QASYMM8).
+    if(axis != 0 || (axis == 0 && is_data_type_quantized(input->data_type())))
+    {
+        return 1;
+    }
     // Calculate number of WGs. 16 elements per thread, 8 threads per WG
     const unsigned int num_of_wg = ceil(input->dimension(0) / 128.f);
 
@@ -51,91 +56,149 @@
 } // namespace
 
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages()
+    : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
 {
 }
 
 Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
 {
-    const unsigned int num_of_stages = calculate_number_of_stages(input);
+    const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
 
-    // Create temporary tensor infos
-    auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
-
-    // Create intermediate tensor info
-    TensorShape shape{ input->tensor_shape() };
-
-    for(unsigned int i = 0; i < num_of_stages - 1; i++)
+    if(axis == 0 && !is_data_type_quantized(input->data_type()))
     {
-        shape.set(0, ceil(shape.x() / 128.f));
-        sums_vector[i].set_data_type(input->data_type());
-        sums_vector[i].set_tensor_shape(shape);
-        sums_vector[i].set_num_channels(input->num_channels());
+        // Create temporary tensor infos
+        auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
+
+        // Create intermediate tensor info
+        TensorShape shape{ input->tensor_shape() };
+
+        for(unsigned int i = 0; i < num_of_stages - 1; i++)
+        {
+            shape.set(0, ceil(shape.x() / 128.f));
+            sums_vector[i].set_data_type(input->data_type());
+            sums_vector[i].set_tensor_shape(shape);
+            sums_vector[i].set_num_channels(input->num_channels());
+        }
+
+        ReductionOperation first_kernel_op;
+        ReductionOperation last_kernel_op;
+        switch(op)
+        {
+            case ReductionOperation::SUM:
+            case ReductionOperation::MEAN_SUM:
+                first_kernel_op = ReductionOperation::SUM;
+                last_kernel_op  = op;
+                break;
+            case ReductionOperation::SUM_SQUARE:
+                first_kernel_op = ReductionOperation::SUM_SQUARE;
+                last_kernel_op  = ReductionOperation::SUM;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+        }
+
+        // Validate ReductionOperation only on first kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, first_kernel_op));
+
+        // Validate ReductionOperation on intermediate stages
+        for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+        }
+
+        // Validate ReductionOperation on the last stage
+        const unsigned int last_stage = num_of_stages - 1;
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
     }
-
-    // Validate ReductionOperation only on first kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, sums_vector.get(), axis, op));
-
-    // Validate ReductionOperation on intermediate stages
-    for(unsigned int i = 1; i < num_of_stages - 1; ++i)
+    else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, op));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
     }
 
-    // Validate ReductionOperation on the last stage
-    const unsigned int last_stage = num_of_stages - 1;
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, op));
-
     return Status{};
 }
 
 void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
 {
-    _num_of_stages = calculate_number_of_stages(input->info());
-
-    // Create temporary tensors
-    _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+    _num_of_stages  = calculate_number_of_stages(input->info(), axis);
+    _reduction_axis = axis;
+    _is_quantized   = is_data_type_quantized(input->info()->data_type());
 
     // Configure reduction operation kernels
     _reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
-    _border_handlers_vector   = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
 
-    TensorShape shape{ input->info()->tensor_shape() };
-    for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+    // Create temporary tensors
+    if(axis == 0 && !_is_quantized)
     {
-        shape.set(0, ceil(shape.x() / 128.f));
-        _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+        _border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
+        _sums_vector            = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+        TensorShape shape{ input->info()->tensor_shape() };
+        for(unsigned int i = 0; i < _num_of_stages - 1; i++)
+        {
+            shape.set(0, ceil(shape.x() / 128.f));
+            _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+        }
+
+        // Apply ReductionOperation only on first kernel
+        _memory_group.manage(_sums_vector.get());
+
+        ReductionOperation first_kernel_op;
+        ReductionOperation last_kernel_op;
+        switch(op)
+        {
+            case ReductionOperation::SUM:
+            case ReductionOperation::MEAN_SUM:
+                first_kernel_op = ReductionOperation::SUM;
+                last_kernel_op  = op;
+                break;
+            case ReductionOperation::SUM_SQUARE:
+                first_kernel_op = ReductionOperation::SUM_SQUARE;
+                last_kernel_op  = ReductionOperation::SUM;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not supported");
+        }
+
+        _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
+        _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+
+        // Apply ReductionOperation on intermediate stages
+        for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+        {
+            _memory_group.manage(_sums_vector.get() + i);
+            _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
+            _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
+            _sums_vector[i - 1].allocator()->allocate();
+        }
+
+        // Apply ReductionOperation on the last stage
+        const unsigned int last_stage  = _num_of_stages - 1;
+        const unsigned int input_width = input->info()->dimension(0);
+        _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+        _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
+        _sums_vector[last_stage - 1].allocator()->allocate();
     }
-
-    // Apply ReductionOperation only on first kernel
-    _memory_group.manage(_sums_vector.get());
-    _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, op);
-    _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
-
-    // Apply ReductionOperation on intermediate stages
-    for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
+    else
     {
-        _memory_group.manage(_sums_vector.get() + i);
-        _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
-        _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
-        _sums_vector[i - 1].allocator()->allocate();
+        _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
     }
-
-    // Apply ReductionOperation on the last stage
-    const unsigned int last_stage = _num_of_stages - 1;
-    _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, ReductionOperation::SUM);
-    _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
-    _sums_vector[last_stage - 1].allocator()->allocate();
 }
 
 void CLReductionOperation::run()
 {
     _memory_group.acquire();
 
-    for(unsigned int i = 0; i < _num_of_stages; ++i)
+    if(_reduction_axis == 0 && !_is_quantized)
     {
-        CLScheduler::get().enqueue(_border_handlers_vector[i], false);
-        CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+        for(unsigned int i = 0; i < _num_of_stages; ++i)
+        {
+            CLScheduler::get().enqueue(_border_handlers_vector[i], false);
+            CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
+        }
+    }
+    else
+    {
+        CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
     }
 
     _memory_group.release();
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
new file mode 100644
index 0000000..8e04d16
--- /dev/null
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLReorgLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void CLReorgLayer::configure(ICLTensor *input, ICLTensor *output, int32_t stride)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReorgLayerKernel>();
+    k->configure(input, output, stride);
+    _kernel = std::move(k);
+}
+
+Status CLReorgLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    return CLReorgLayerKernel::validate(input, output, stride);
+}
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index 2ce83dc..b98a99d 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "support/ToolchainSupport.h"
 
+/** [CLReshapeLayer snippet] **/
 using namespace arm_compute;
 
 void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
@@ -35,3 +36,9 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status CLReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return CLReshapeLayerKernel::validate(input, output);
+}
+/** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 4ff9763..f204e64 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -50,3 +50,9 @@
     }
     _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
 }
+
+Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+    ARM_COMPUTE_UNUSED(constant_border_value);
+    return CLScaleKernel::validate(input, output, policy, border_mode, sampling_policy);
+}
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
new file mode 100644
index 0000000..bef7eca
--- /dev/null
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSlice.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    // Get absolute end coordinates
+    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->info()->tensor_shape(), ends);
+
+    auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+    k->configure(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+    _kernel = std::move(k);
+}
+
+Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+
+    // Check start dimensions for being non-negative
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
+    {
+        return i < 0;
+    }));
+
+    // Get absolute end coordinates
+    const Coordinates ends_abs = arm_compute::helpers::tensor_transform::slice_absolute_end_coords(input->tensor_shape(), ends);
+
+    return CLStridedSliceKernel::validate(input, output, starts, ends_abs, BiStrides(), 0, 0, 0);
+}
+} // namespace arm_compute
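A brief usage sketch for the new CLSlice function: extract a 4x4 block starting at x = 1 from a 6x4 tensor. The shapes and the end-exclusive reading of the end coordinates are assumptions chosen so that the output shape below is consistent.

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"

using namespace arm_compute;

void slice_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(6U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

    CLSlice slice;
    // Start coordinates must be non-negative (see validate() above)
    slice.configure(&src, &dst, Coordinates(1, 0), Coordinates(5, 4));

    src.allocator()->allocate();
    dst.allocator()->allocate();

    slice.run();
    CLScheduler::get().sync();
}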
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7a20d9f..d671846 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -29,29 +29,80 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLSoftmaxLayer::CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
+    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
+      _needs_flattening(false)
 {
 }
 
-void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta)
+void CLSoftmaxLayer::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+{
+    // Flatten the input
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+
+    // Initialize the flat input
+    _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
+
+    // If we need to flatten the input, we can use either CLFlattenLayerKernel or CLReshapeLayerKernel.
+    // When flattening on the third axis we use CLFlattenLayerKernel;
+    // in all other cases we have to use CLReshapeLayerKernel.
+    if(axis != 3)
+    {
+        auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>();
+        reshape_kernel_ptr->configure(input, &_input_flattened);
+        _flatten_kernel_ptr = std::move(reshape_kernel_ptr);
+    }
+    else
+    {
+        auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>();
+        flatten_kernel_ptr->configure(input, &_input_flattened);
+        _flatten_kernel_ptr = std::move(flatten_kernel_ptr);
+    }
+
+    // We need to init the output tensor here. Indeed, the reshape kernel expects
+    // both tensors to be already initialized
+    auto_init_if_empty(*output->info(), *input->info()->clone());
+}
+
+void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float beta, size_t axis)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayer::validate(input->info(), output->info(), beta, axis));
+
+    // Flattening is not needed only when the input is 2D and the axis is 1
+    _needs_flattening = axis != 1;
+
+    // If we are dealing with a 4D tensor, we will:
+    // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
+    // - Execute the whole pipeline (reduction + normalization) on the flattened tensor
+    // - Reshape the flattened output into the real output
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _input_flattened
+        _memory_group.manage(&_input_flattened);
+
+        // Configure _flatten_kernel and _input_flattened
+        configure_reshape_input_kernel(input, output, axis);
+    }
+
+    // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
+    // or it is the original input itself (2D case)
+    const ICLTensor *input_2D = (_needs_flattening ? &_input_flattened : input);
 
     // Create intermediate tensors shapes
-    const TensorInfo input_info    = input->info()->clone()->reset_padding().set_is_resizable(true);
-    DataType         tmp_data_type = is_data_type_quantized_asymmetric(input->info()->data_type()) ? DataType::S32 : input->info()->data_type();
-    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+    TensorInfo input_info    = input_2D->info()->clone()->reset_padding().set_is_resizable(true);
+    DataType   tmp_data_type = is_data_type_quantized_asymmetric(input_2D->info()->data_type()) ? DataType::S32 : input_2D->info()->data_type();
+    TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
     _tmp.allocator()->init(tensor_info_tmp);
 
-    TensorShape max_sum_shape = input->info()->tensor_shape();
+    TensorShape max_sum_shape = input_2D->info()->tensor_shape();
     max_sum_shape.set(0, 1);
     _max.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape));
     _sum.allocator()->init(input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type));
@@ -65,8 +116,28 @@
     _memory_group.manage(&_sum);
 
     // Configure kernels
-    _max_shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum, beta);
-    _norm_kernel.configure(&_tmp, &_sum, output, beta);
+    _max_shift_exp_sum_kernel.configure(input_2D, &_max, &_tmp, &_sum, beta);
+
+    if(_needs_flattening)
+    {
+        // Add to the memory manager _output_flattened
+        _memory_group.manage(&_output_flattened);
+
+        // The normalization kernel stores the result in a flat output tensor
+        _norm_kernel.configure(&_tmp, &_sum, &_output_flattened, beta);
+
+        // Reshape the flat output into the requested (4D) output
+        _reshape_kernel.configure(&_output_flattened, output);
+
+        // Allocate the intermediate flat tensors
+        _input_flattened.allocator()->allocate();
+        _output_flattened.allocator()->allocate();
+    }
+    else
+    {
+        // Softmax 2D case
+        _norm_kernel.configure(&_tmp, &_sum, output, beta);
+    }
 
     // Allocate intermediate buffers
     _tmp.allocator()->allocate();
@@ -74,10 +145,11 @@
     _sum.allocator()->allocate();
 }
 
-Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLSoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(beta);
 
     // Create intermediate tensor info
     DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -88,9 +160,32 @@
     TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
     TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
 
+    const bool needs_flattening = (axis != 1);
+
+    if(needs_flattening)
+    {
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+
+        if(axis != 3)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+        }
+    }
+
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DMaxShiftExpSumKernel::validate(input, &tensor_info_max, &tensor_info_tmp, &tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLogits1DNormKernel::validate(&tensor_info_tmp, &tensor_info_sum, output));
 
+    if(needs_flattening)
+    {
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input);
+        TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
+    }
+
     return Status{};
 }
 
@@ -98,8 +193,21 @@
 {
     _memory_group.acquire();
 
-    CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
-    CLScheduler::get().enqueue(_norm_kernel);
+    if(_needs_flattening)
+    {
+        CLScheduler::get().enqueue(*_flatten_kernel_ptr, false);
+    }
 
+    CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
+    CLScheduler::get().enqueue(_norm_kernel, !_needs_flattening);
+
+    if(_needs_flattening)
+    {
+        CLScheduler::get().enqueue(_reshape_kernel, true);
+    }
+
+    // Release intermediate buffers
     _memory_group.release();
 }
+
+} // namespace arm_compute
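For reference, a minimal usage sketch of the extended configure() signature above, using the 2D case with axis = 1 (which keeps the non-flattening fast path). Shapes are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

void softmax_sketch()
{
    CLScheduler::get().default_init();

    CLTensor logits, probs;
    logits.allocator()->init(TensorInfo(TensorShape(1000U, 4U), 1, DataType::F32)); // 1000 classes, 4 batches
    probs.allocator()->init(TensorInfo(TensorShape(1000U, 4U), 1, DataType::F32));

    CLSoftmaxLayer softmax;
    softmax.configure(&logits, &probs, 1.f /* beta */, 1 /* axis: 2D case, no flattening needed */);

    logits.allocator()->allocate();
    probs.allocator()->allocate();

    softmax.run();
    CLScheduler::get().sync();
}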
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
new file mode 100644
index 0000000..76c1e18
--- /dev/null
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLSpaceToBatchLayer::CLSpaceToBatchLayer()
+    : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+{
+}
+
+void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+    }
+
+    _output = output;
+    _space_to_batch_kernel.configure(input, block_shape, paddings, output);
+}
+
+void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    {
+        _has_padding = true;
+    }
+
+    _output = output;
+    _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+{
+    return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+}
+
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+                                     const ITensorInfo *output)
+{
+    return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+}
+
+void CLSpaceToBatchLayer::run()
+{
+    // Zero out the output only if we have padding
+    // TODO(micspy01): replace with memset once ready
+    if(_has_padding)
+    {
+        _output->map(CLScheduler::get().queue(), true);
+        if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
+        {
+            const uint8_t quantized_zero = _output->info()->quantization_info().offset;
+            std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
+        }
+        else
+        {
+            memset(_output->buffer(), 0, _output->info()->total_size());
+        }
+        _output->unmap(CLScheduler::get().queue());
+    }
+
+    CLScheduler::get().enqueue(_space_to_batch_kernel, true);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
new file mode 100644
index 0000000..f084351
--- /dev/null
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLSplit.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+CLSplit::CLSplit()
+    : _outputs_vector(), _slice_functions(), _num_outputs(0)
+{
+}
+
+void CLSplit::configure(const ICLTensor *input, const std::vector<ICLTensor *> &outputs, unsigned int axis)
+{
+    // Create Slice functions
+    _num_outputs     = outputs.size();
+    _slice_functions = arm_compute::support::cpp14::make_unique<CLSlice[]>(_num_outputs);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs);
+
+    // Extract output tensor info
+    std::vector<ITensorInfo *> outputs_info;
+    for(auto &output : outputs)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+        outputs_info.emplace_back(output->info());
+    }
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(CLSplit::validate(input->info(), outputs_info, axis));
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    for(unsigned int i = 0; i < _num_outputs; i++)
+    {
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        // Configure slice function
+        _slice_functions[i].configure(input, outputs[i], start_coords, end_coords);
+
+        // Set valid region from shape
+        outputs[i]->info()->set_valid_region(ValidRegion(Coordinates(), output_shape));
+
+        // Update axis offset
+        axis_offset += axis_split_step;
+    }
+}
+
+Status CLSplit::validate(const ITensorInfo *input, const std::vector<ITensorInfo *> &outputs, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis >= input->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(outputs.size() < 2);
+
+    // Get output shape
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_split_shape(input, axis, outputs.size());
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
+
+    const size_t axis_split_step = output_shape[axis];
+    unsigned int axis_offset     = 0;
+
+    // Start/End coordinates
+    Coordinates start_coords;
+    Coordinates end_coords;
+    for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+    {
+        end_coords.set(d, -1);
+    }
+
+    // Validate output tensors
+    for(const auto &output : outputs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+        // Output auto initialization if not yet initialized
+        TensorInfo tmp_output_info = *output->clone();
+        auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+
+        // Update coordinate on axis
+        start_coords.set(axis, axis_offset);
+        end_coords.set(axis, axis_offset + axis_split_step);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(input, output, start_coords, end_coords));
+        axis_offset += axis_split_step;
+    }
+
+    return Status{};
+}
+
+void CLSplit::run()
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    for(unsigned i = 0; i < _num_outputs; ++i)
+    {
+        _slice_functions[i].run();
+    }
+}
+} // namespace arm_compute
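A usage sketch for the new CLSplit function (illustrative shapes): split an 8x4 tensor into two 4x4 tensors along the x axis.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSplit.h"

#include <vector>

using namespace arm_compute;

void split_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst0, dst1;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst0.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    dst1.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

    CLSplit split;
    std::vector<ICLTensor *> outputs = { &dst0, &dst1 };
    split.configure(&src, outputs, 0 /* axis */);

    src.allocator()->allocate();
    dst0.allocator()->allocate();
    dst1.allocator()->allocate();

    split.run();
    CLScheduler::get().sync();
}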
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
new file mode 100644
index 0000000..e34f653
--- /dev/null
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+
+#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>();
+    k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _kernel = std::move(k);
+}
+
+Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+} // namespace arm_compute
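
Note (illustrative, not part of the patch): CLStridedSlice forwards everything to CLStridedSliceKernel, where the masks are interpreted. As a rough mental model only (this mirrors the usual TensorFlow-style semantics, not necessarily the kernel's exact implementation), a set bit in begin_mask or end_mask makes the corresponding start or end fall back to the full range, and negative indices count from the end of the dimension:

// Simplified, single-dimension reading of begin_mask/end_mask for a positive stride.
#include <cstdint>
#include <iostream>

struct DimSlice
{
    int start;
    int end;
};

DimSlice resolve_dim(int dim_size, int start, int end, int32_t begin_mask, int32_t end_mask, unsigned int dim_index)
{
    // A set bit means "ignore the provided value and use the full range".
    if(begin_mask & (1 << dim_index))
    {
        start = 0;
    }
    if(end_mask & (1 << dim_index))
    {
        end = dim_size;
    }
    // Negative indices count backwards from the end of the dimension.
    if(start < 0)
    {
        start += dim_size;
    }
    if(end < 0)
    {
        end += dim_size;
    }
    return { start, end };
}

int main()
{
    // With bit 0 of begin_mask set, the provided start of 2 is ignored: [0, 5).
    const DimSlice s = resolve_dim(8, 2, 5, 0x1, 0x0, 0);
    std::cout << s.start << " " << s.end << "\n";
    return 0;
}
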
diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp
new file mode 100644
index 0000000..1dad325
--- /dev/null
+++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLUpsampleLayer::CLUpsampleLayer() // NOLINT
+    : _upsample(),
+      _output(nullptr)
+{
+}
+
+Status CLUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                 const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+    return CLUpsampleLayerKernel::validate(input, output, info, upsampling_policy);
+}
+
+void CLUpsampleLayer::configure(ICLTensor *input, ICLTensor *output,
+                                const Size2D &info, const InterpolationPolicy upsampling_policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _output = output;
+    _upsample.configure(input, _output, info, upsampling_policy);
+}
+
+void CLUpsampleLayer::run()
+{
+    CLScheduler::get().enqueue(_upsample, false);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
index 5233ff4..46a2d80 100644
--- a/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLWidthConcatenateLayer.cpp
@@ -36,26 +36,46 @@
 
 CLWidthConcatenateLayer::CLWidthConcatenateLayer() // NOLINT
     : _concat_kernels_vector(),
+      _concat_x2_kernel(),
+      _concat_x4_kernel(),
       _num_inputs(0)
 {
 }
 
 Status CLWidthConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output) // NOLINT
 {
+    const unsigned int num_inputs = inputs_vector.size();
+
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     // Output auto inizialitation if not yet initialized
     TensorInfo  tmp_output_info = *output->clone();
     TensorShape output_shape    = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
     auto_init_if_empty(tmp_output_info, output_shape, 1, inputs_vector[0]->data_type());
 
-    unsigned int width_offset = 0;
-    for(const auto &input : inputs_vector)
+    switch(num_inputs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
-        width_offset += input->dimension(0);
+        case 2:
+            // Validate the WidthConcatenate2Tensors kernel when there are 2 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(inputs_vector[0], inputs_vector[1], &tmp_output_info));
+            break;
+        case 4:
+            // Validate the WidthConcatenate4Tensors kernel when there are 4 inputs
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3]);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate4TensorsKernel::validate(inputs_vector[0], inputs_vector[1], inputs_vector[2], inputs_vector[3], &tmp_output_info));
+            break;
+        default:
+            unsigned int width_offset = 0;
+            // Validate the generic WidthConcatenate kernel case
+            for(const auto &input : inputs_vector)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+                ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenateLayerKernel::validate(input, width_offset, &tmp_output_info));
+                width_offset += input->dimension(0);
+            }
+            break;
     }
 
     return Status{};
@@ -74,16 +94,30 @@
 
     // Output auto inizialitation if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
+
     ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayer::validate(inputs_vector_info, output->info()));
 
-    unsigned int width_offset = 0;
-
-    _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
-
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
-        width_offset += inputs_vector.at(i)->info()->dimension(0);
+        case 2:
+            // Configure WidthConcatenate2Tensors kernel
+            _concat_x2_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), output);
+            break;
+        case 4:
+            // Configure WidthConcatenate4Tensors kernel
+            _concat_x4_kernel.configure(inputs_vector.at(0), inputs_vector.at(1), inputs_vector.at(2), inputs_vector.at(3), output);
+            break;
+        default:
+            // Configure the generic WidthConcatenate kernels
+            _concat_kernels_vector = arm_compute::support::cpp14::make_unique<CLWidthConcatenateLayerKernel[]>(_num_inputs);
+
+            unsigned int width_offset = 0;
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                _concat_kernels_vector[i].configure(inputs_vector.at(i), width_offset, output);
+                width_offset += inputs_vector.at(i)->info()->dimension(0);
+            }
+            break;
     }
 }
 
@@ -91,8 +125,19 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    for(unsigned i = 0; i < _num_inputs; i++)
+    switch(_num_inputs)
     {
-        CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+        case 2:
+            CLScheduler::get().enqueue(_concat_x2_kernel, true);
+            break;
+        case 4:
+            CLScheduler::get().enqueue(_concat_x4_kernel, true);
+            break;
+        default:
+            for(unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                CLScheduler::get().enqueue(_concat_kernels_vector[i], true);
+            }
+            break;
     }
 }
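
Note (illustrative, not part of the patch): the generic path kept above is just a running width offset, with each input written immediately after the previous one along X. The same bookkeeping reduced to plain 1-D buffers instead of padded CL tensors:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> concatenate_width(const std::vector<std::vector<float>> &inputs)
{
    std::size_t total_width = 0;
    for(const auto &in : inputs)
    {
        total_width += in.size();
    }

    std::vector<float> output(total_width);
    std::size_t        width_offset = 0;
    for(const auto &in : inputs)
    {
        std::copy(in.begin(), in.end(), output.begin() + width_offset);
        width_offset += in.size();
    }
    return output;
}

int main()
{
    for(float v : concatenate_width({ { 1.f, 2.f }, { 3.f }, { 4.f, 5.f } }))
    {
        std::cout << v << " ";
    }
    std::cout << "\n";
    return 0;
}
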
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index a70389a..1abcb67 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -104,9 +104,9 @@
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); // Disable Winograd for FP16 when fast math is disabled.
         ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
-
     const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                     kernel_size,
                                                     input_dims,
@@ -129,7 +129,8 @@
     _filter_transform.configure(weights, &_input1, winograd_info);
 
     // Configure batched matrix multiply
-    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+    _batched_mm.configure(&_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, GEMMLowpOutputStageInfo(),
+                                                                                                 (input->info()->data_type() == DataType::F16)));
 
     // Configure output transform
     _output_transform.configure(&_batched_mm_output, biases, output, winograd_info);
@@ -161,6 +162,7 @@
     // Check if the Winograd configuration requires fast math
     if(!enable_fast_math)
     {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); // Disable Winograd for FP16 when fast math is disabled.
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
     }
 
@@ -184,7 +186,8 @@
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                                                                                                     GEMMLowpOutputStageInfo(), (input->data_type() == DataType::F16))));
 
     // Configure output transform
     ARM_COMPUTE_RETURN_ON_ERROR(CLWinogradOutputTransformKernel::validate(&batched_mm_output, biases, output, winograd_info));
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
new file mode 100644
index 0000000..5a612ba
--- /dev/null
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
+
+#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLYOLOLayer::configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLYOLOLayerKernel>();
+    k->configure(input, output, act_info, num_classes);
+    _kernel = std::move(k);
+}
+
+Status CLYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    return CLYOLOLayerKernel::validate(input, output, act_info, num_classes);
+}
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 2d52f33..187f52f 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -132,9 +132,12 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning over 30 representative tensor shapes.
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target,
+                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT))
     {
-        if((k._convolved_dims.first == 7) || (k._convolved_dims.first == 14))
+        if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14))
         {
             lws_hint = cl::NDRange(1, 7, 1);
         }
@@ -153,7 +156,11 @@
     const GPUTarget gpu_target = k.get_target();
 
     // Local work size optimized for the 11x11 AlexNet convolution on Bifrost.
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76) && k._kernel_dims.width == 11)
+    if(gpu_target_is_in(gpu_target,
+                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT)
+       && k._kernel_dims.width == 11)
     {
         const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height);
         if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding())
@@ -171,7 +178,10 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target,
+                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT))
     {
         lws_hint = cl::NDRange(1, 2, 1);
     }
@@ -186,7 +196,10 @@
 
     // Configure the local work size for Bifrost with a value obtained
     // via exhaustive autotuning for the MobileNets tensor shapes.
-    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+    if(gpu_target_is_in(gpu_target,
+                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                        GPUTarget::G52, GPUTarget::G52LIT))
     {
         lws_hint = cl::NDRange(1, 1, 1);
     }
@@ -207,6 +220,8 @@
         case GPUTarget::G51:
         case GPUTarget::G51BIG:
         case GPUTarget::G51LIT:
+        case GPUTarget::G52:
+        case GPUTarget::G52LIT:
         case GPUTarget::G76:
             if(k._input1->info()->dimension(1) == 24)
             {
@@ -240,7 +255,10 @@
     // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
     if(k._input->info()->data_layout() == DataLayout::NCHW)
     {
-        if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
+        if(gpu_target_is_in(gpu_target,
+                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
+                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
+                            GPUTarget::G52, GPUTarget::G52LIT))
         {
             cl::NDRange gws = ICLKernel::gws_from_window(k.window());
             lws_hint        = cl::NDRange(gws[0], gws[1], 1);
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index de28b4f..2b179fd 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -215,7 +215,6 @@
     : _num_threads(num_threads_hint()),
       _threads(_num_threads - 1)
 {
-    get_cpu_configuration(_cpu_info);
 }
 
 void CPPScheduler::set_num_threads(unsigned int num_threads)
@@ -229,6 +228,7 @@
     return _num_threads;
 }
 
+#ifndef DOXYGEN_SKIP_THIS
 void CPPScheduler::run_workloads(std::vector<IScheduler::Workload> &workloads)
 {
     const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
@@ -263,6 +263,7 @@
         std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
     }
 }
+#endif /* DOXYGEN_SKIP_THIS */
 
 void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
new file mode 100644
index 0000000..2e10152
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
+
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                                                    ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CPPBoxWithNonMaximaSuppressionLimitKernel>();
+    k->configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+    _kernel = std::move(k);
+}
\ No newline at end of file
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index 6c21086..ac19d08 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -134,6 +134,9 @@
                 }
                 break;
             case 0xd0b: // A76
+            case 0xd06: // A65
+            case 0xd0c: // N1
+            case 0xd0d: // A77
                 model = CPUModel::GENERIC_FP16_DOT;
                 break;
             default:
@@ -160,8 +163,8 @@
             std::string line;
             if(bool(getline(file, line)))
             {
-                const unsigned long midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16);
-                c                        = midr_to_model(midr & 0xffffffff);
+                const uint32_t midr = support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16);
+                c                   = midr_to_model(midr & 0xffffffff);
             }
         }
     }
@@ -170,11 +173,11 @@
 void populate_models_cpuinfo(std::vector<CPUModel> &cpusv)
 {
     // If "long-form" cpuinfo is present, parse that to populate models.
-    std::regex proc_regex("^processor.*(\\d+)$");
-    std::regex imp_regex("^CPU implementer.*0x(..)$");
-    std::regex var_regex("^CPU variant.*0x(.)$");
-    std::regex part_regex("^CPU part.*0x(...)$");
-    std::regex rev_regex("^CPU revision.*(\\d+)$");
+    std::regex proc_regex(R"(^processor.*(\d+)$)");
+    std::regex imp_regex(R"(^CPU implementer.*0x(..)$)");
+    std::regex var_regex(R"(^CPU variant.*0x(.)$)");
+    std::regex part_regex(R"(^CPU part.*0x(...)$)");
+    std::regex rev_regex(R"(^CPU revision.*(\d+)$)");
 
     std::ifstream file;
     file.open("/proc/cpuinfo", std::ios::in);
@@ -317,10 +320,12 @@
         hwcaps_fp16_support = true;
     }
 
+#if defined(__aarch64__)
     if((hwcaps & HWCAP_ASIMDDP) != 0)
     {
         hwcaps_dot_support = true;
     }
+#endif /* defined(__aarch64__) */
 
     const unsigned int max_cpus = get_max_cpus();
     cpuinfo.set_cpu_num(max_cpus);
@@ -334,17 +339,18 @@
         populate_models_cpuinfo(percpu);
     }
     int j(0);
-    // Update dot product and FP16 support if all CPUs support these features:
-    bool all_support_dot  = true;
-    bool all_support_fp16 = true;
+    // Update dot product and FP16 support if any of the CPUs supports these features
+    // We assume that the system does not have mixed architectures
+    bool one_supports_dot  = false;
+    bool one_supports_fp16 = false;
     for(const auto &v : percpu)
     {
-        all_support_dot &= model_supports_dot(v);
-        all_support_fp16 &= model_supports_fp16(v);
+        one_supports_dot  = one_supports_dot || model_supports_dot(v);
+        one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v);
         cpuinfo.set_cpu_model(j++, v);
     }
-    cpuinfo.set_dotprod(all_support_dot || hwcaps_dot_support);
-    cpuinfo.set_fp16(all_support_fp16 || hwcaps_fp16_support);
+    cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support);
+    cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support);
 #else  /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
     ARM_COMPUTE_UNUSED(cpuinfo);
 #endif /* !defined(BARE_METAL) && (defined(__arm__) || defined(__aarch64__)) */
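
Note (illustrative, not part of the patch): two details of the CPUUtils change are easy to show in isolation; the part number matched by the switch above sits in bits [15:4] of the MIDR value, and the per-core capability flags are now OR-reduced ("any core supports it") instead of AND-reduced:

#include <cstdint>
#include <iostream>
#include <vector>

uint32_t midr_part(uint32_t midr)
{
    // MIDR part number field: bits [15:4].
    return (midr >> 4) & 0xFFF;
}

bool any_core_supports(const std::vector<bool> &per_core_flag)
{
    bool supported = false;
    for(bool f : per_core_flag)
    {
        supported = supported || f;
    }
    return supported;
}

int main()
{
    // Example MIDR whose part field decodes to 0xd0b (Cortex-A76).
    std::cout << std::hex << midr_part(0x4100d0b0) << std::dec << "\n";
    std::cout << std::boolalpha << any_core_supports({ false, true, false }) << "\n";
    return 0;
}
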
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
index cdd12c3..70a1f4f 100644
--- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -22,10 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
 
 #include <cstddef>
 
@@ -34,24 +34,27 @@
 void *GCBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    auto *gl_buffer = new GLBufferWrapper();
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, gl_buffer->_ssbo_name));
+
+    auto *gl_ssbo_name = new GLuint;
+    ARM_COMPUTE_GL_CHECK(glGenBuffers(1, gl_ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, *gl_ssbo_name));
     ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
     ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
 
-    return reinterpret_cast<void *>(gl_buffer);
+    return reinterpret_cast<void *>(gl_ssbo_name);
 }
 
 void GCBufferAllocator::free(void *ptr)
 {
     ARM_COMPUTE_ERROR_ON(ptr == nullptr);
-    auto *gl_buffer = reinterpret_cast<GLBufferWrapper *>(ptr);
-    delete gl_buffer;
+    auto *gl_ssbo_name = reinterpret_cast<GLuint *>(ptr);
+    ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, gl_ssbo_name));
+    delete gl_ssbo_name;
 }
 
 std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
 {
-    ARM_COMPUTE_UNUSED(size, alignment);
-    return nullptr;
+    ARM_COMPUTE_UNUSED(alignment);
+    return arm_compute::support::cpp14::make_unique<GCBufferMemoryRegion>(size);
 }
 } // namespace arm_compute
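
Note (hedged usage sketch, not part of the patch): allocate() now returns a heap-allocated GLuint holding the SSBO name, and free() deletes both the GL buffer and that integer, so the two calls must always be paired. This assumes a GLES compute context is current and that the allocator is default-constructible, as in this version:

#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"

void example_allocation()
{
    arm_compute::GCBufferAllocator allocator;

    // The returned pointer is an opaque handle; underneath it is a GLuint*.
    void *handle = allocator.allocate(1024, 0);

    // ... hand the handle to whatever owns the buffer for its lifetime ...

    allocator.free(handle);
}
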
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
new file mode 100644
index 0000000..fed4a15
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
+
+#include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+
+namespace arm_compute
+{
+GCMemory::GCMemory()
+    : _region(nullptr), _region_owned(nullptr)
+{
+}
+
+GCMemory::GCMemory(std::shared_ptr<IGCMemoryRegion> memory)
+    : _region(nullptr), _region_owned(std::move(memory))
+{
+    // _region_owned is already set from the initializer list; 'memory' has been moved from.
+    _region       = _region_owned.get();
+}
+
+GCMemory::GCMemory(IGCMemoryRegion *memory)
+    : _region(memory), _region_owned(nullptr)
+{
+    _region = memory;
+}
+
+IGCMemoryRegion *GCMemory::gc_region()
+{
+    return _region;
+}
+
+IGCMemoryRegion *GCMemory::gc_region() const
+{
+    return _region;
+}
+
+IMemoryRegion *GCMemory::region()
+{
+    return _region;
+}
+
+IMemoryRegion *GCMemory::region() const
+{
+    return _region;
+}
+
+void GCMemory::set_region(IMemoryRegion *region)
+{
+    auto gc_region = utils::cast::polymorphic_downcast<IGCMemoryRegion *>(region);
+    _region_owned  = nullptr;
+    _region        = gc_region;
+}
+
+void GCMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+    _region_owned = utils::cast::polymorphic_downcast_unique_ptr<IGCMemoryRegion>(std::move(region));
+    _region       = _region_owned.get();
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
new file mode 100644
index 0000000..45fd6e8
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+IGCMemoryRegion::IGCMemoryRegion(size_t size)
+    : IMemoryRegion(size), _mapping(nullptr), _ssbo_name(0)
+{
+}
+
+const GLuint &IGCMemoryRegion::gc_ssbo_name() const
+{
+    return _ssbo_name;
+}
+
+void *IGCMemoryRegion::buffer()
+{
+    return _mapping;
+}
+
+void *IGCMemoryRegion::buffer() const
+{
+    return _mapping;
+}
+
+GCBufferMemoryRegion::GCBufferMemoryRegion(size_t size)
+    : IGCMemoryRegion(size)
+{
+    ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+}
+
+GCBufferMemoryRegion::~GCBufferMemoryRegion()
+{
+    ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name));
+}
+
+void *GCBufferMemoryRegion::ptr()
+{
+    return nullptr;
+}
+
+void *GCBufferMemoryRegion::map(bool blocking)
+{
+    ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
+    ARM_COMPUTE_UNUSED(blocking);
+
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+    void *p  = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
+    _mapping = reinterpret_cast<uint8_t *>(p);
+
+    return _mapping;
+}
+
+void GCBufferMemoryRegion::unmap()
+{
+    ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
+    ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
+    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    _mapping = nullptr;
+}
+
+std::unique_ptr<IMemoryRegion> GCBufferMemoryRegion::extract_subregion(size_t offset, size_t size)
+{
+    ARM_COMPUTE_UNUSED(offset, size);
+    return nullptr;
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index abd2b48..a0dd540 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -26,21 +26,17 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
 GCTensorAllocator::GCTensorAllocator(GCTensor *owner)
-    : _associated_memory_group(nullptr), _gl_buffer(), _mapping(nullptr), _owner(owner)
+    : _associated_memory_group(nullptr), _memory(), _mapping(nullptr), _owner(owner)
 {
 }
 
-GCTensorAllocator::~GCTensorAllocator()
-{
-    _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
-}
-
 uint8_t *GCTensorAllocator::data()
 {
     return _mapping;
@@ -50,32 +46,28 @@
 {
     if(_associated_memory_group == nullptr)
     {
-        _gl_buffer = support::cpp14::make_unique<GLBufferWrapper>();
-        ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
-        ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(info().total_size()), nullptr, GL_STATIC_DRAW));
-        ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+        _memory.set_owned_region(support::cpp14::make_unique<GCBufferMemoryRegion>(info().total_size()));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(&_gl_buffer), info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
     }
     info().set_is_resizable(false);
 }
 
 void GCTensorAllocator::free()
 {
-    if(_associated_memory_group == nullptr)
-    {
-        _gl_buffer.reset();
-        info().set_is_resizable(true);
-    }
+    _mapping = nullptr;
+    _memory.set_region(nullptr);
+    info().set_is_resizable(true);
 }
 
 void GCTensorAllocator::set_associated_memory_group(GCMemoryGroup *associated_memory_group)
 {
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_gl_buffer.get() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.gc_region()->gc_ssbo_name() != 0);
+
     _associated_memory_group = associated_memory_group;
 }
 
@@ -91,27 +83,23 @@
 
 GLuint GCTensorAllocator::get_gl_ssbo_name() const
 {
-    return _gl_buffer->_ssbo_name;
+    return (_memory.region() == nullptr) ? static_cast<GLuint>(0) : _memory.gc_region()->gc_ssbo_name();
 }
 
 uint8_t *GCTensorAllocator::map(bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
-    ARM_COMPUTE_UNUSED(blocking);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
 
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
-    void *p  = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(info().total_size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
-    _mapping = reinterpret_cast<uint8_t *>(p);
-
+    _mapping = reinterpret_cast<uint8_t *>(_memory.gc_region()->map(blocking));
     return _mapping;
 }
 
 void GCTensorAllocator::unmap()
 {
     ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
 
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name));
-    ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
-    ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
+    _memory.gc_region()->unmap();
     _mapping = nullptr;
 }
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index a7a56b6..c58d184 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -150,6 +150,7 @@
     shape_im2col.set(1, mat_input_rows);
     shape_im2col.set(2, 1);
 
+    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
     TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
     _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
     _memory_group.manage(&_input_im2col_reshaped);
@@ -160,6 +161,7 @@
     shape_gemm.set(1, mat_input_rows);
     const DataType gemm_data_type = dt;
 
+    // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
     TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
     _gemm_output.allocator()->init(info_gemm);
     _memory_group.manage(&_gemm_output);
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 7121654..d9aa50d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -31,11 +31,12 @@
 using namespace arm_compute;
 
 GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
-    : _kernel(nullptr), _border_handler(), _shift_handler()
+    : _kernel(nullptr), _border_handler(), _shift_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
 {
 }
 
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
+                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
     k->configure(input, weights, biases, output, conv_info, depth_multiplier);
@@ -45,6 +46,14 @@
     _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
 
     _shift_handler.configure(input);
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 void GCDepthwiseConvolutionLayer3x3::run()
@@ -54,4 +63,10 @@
     GCScheduler::get().dispatch(_border_handler, false);
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(*_kernel);
+
+    // Run Activation Layer
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
index 5fb971c..19fdc3d 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,9 +37,15 @@
 {
 }
 
-void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
 {
-    _norm_kernel.configure(input, output, mean, sd);
+    _norm_kernel.configure(input, output, mean, std);
+}
+
+Status GCNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                           const ITensorInfo *mean, const ITensorInfo *std)
+{
+    return GCNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
 }
 
 void GCNormalizePlanarYUVLayer::run()
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 0c8769b..dad42cd 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -34,12 +34,13 @@
 {
 }
 
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta)
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
 {
-    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(beta, axis);
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(beta != 1.0f);
+    ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
 
     // Create intermediate tensors shapes
     _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 54a2bd2..b2edad0 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/IScheduler.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CPUUtils.h"
 
 namespace arm_compute
@@ -30,6 +31,7 @@
 IScheduler::IScheduler()
     : _cpu_info()
 {
+    get_cpu_configuration(_cpu_info);
     // Work out the best possible number of execution threads
     _num_threads_hint = get_threads_hint();
 }
@@ -43,4 +45,10 @@
 {
     return _num_threads_hint;
 }
+void IScheduler::run_tagged_workloads(std::vector<Workload> &workloads, const char *tag)
+{
+    ARM_COMPUTE_UNUSED(tag);
+    run_workloads(workloads);
+}
+
 } // namespace arm_compute
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index faaff8a..7d928d6 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/IAllocator.h"
+#include "arm_compute/runtime/IMemory.h"
 #include "arm_compute/runtime/IMemoryGroup.h"
 #include "arm_compute/runtime/IMemoryPool.h"
 #include "support/ToolchainSupport.h"
@@ -70,7 +71,7 @@
     _active_elements.insert(std::make_pair(obj, obj));
 }
 
-void ISimpleLifetimeManager::end_lifetime(void *obj, void **handle, size_t size)
+void ISimpleLifetimeManager::end_lifetime(void *obj, IMemory &obj_memory, size_t size)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
 
@@ -80,7 +81,7 @@
 
     // Update object fields and mark object as complete
     Element &el = active_object_it->second;
-    el.handle   = handle;
+    el.handle   = &obj_memory;
     el.size     = size;
     el.status   = true;
 
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
new file mode 100644
index 0000000..ad00070
--- /dev/null
+++ b/src/runtime/MEMUtils.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Error.h"
+#include "support/ToolchainSupport.h"
+
+#ifndef BARE_METAL
+#include <fstream>
+#include <regex>
+#include <sstream>
+#endif // ifndef BARE_METAL
+
+namespace
+{
+void parse_mem_info(size_t &total, size_t &free, size_t &buffer)
+{
+    free   = 0;
+    total  = 0;
+    buffer = 0;
+#ifndef BARE_METAL
+    size_t        memcache = 0;
+    size_t        memfree  = 0;
+    std::ifstream meminfo_f;
+    meminfo_f.open("/proc/meminfo", std::ios::in);
+    if(meminfo_f.is_open())
+    {
+        std::stringstream str_stream;
+        str_stream << meminfo_f.rdbuf();
+        const std::string str = str_stream.str();
+        try
+        {
+            std::smatch match;
+            if(std::regex_search(str, match, std::regex("MemTotal: (.*)kB")) && match.size() > 1)
+            {
+                const std::string result = match.str(1);
+                total                    = std::stoul(result, nullptr, 0);
+            }
+            if(std::regex_search(str, match, std::regex("MemFree: (.*)kB")) && match.size() > 1)
+            {
+                const std::string result = match.str(1);
+                memfree                  = std::stoul(result, nullptr, 0);
+            }
+            if(std::regex_search(str, match, std::regex("Buffers: (.*)kB")) && match.size() > 1)
+            {
+                const std::string result = match.str(1);
+                buffer                   = std::stoul(result, nullptr, 0);
+            }
+            if(std::regex_search(str, match, std::regex("Cached: (.*)kB")) && match.size() > 1)
+            {
+                const std::string result = match.str(1);
+                memcache                 = std::stoul(result, nullptr, 0);
+            }
+            free = memfree + (buffer + memcache);
+        }
+        catch(std::regex_error &e)
+        {
+            // Failed to parse /proc/meminfo;
+            // leave all fields at 0
+        }
+    }
+#endif // ifndef BARE_METAL
+}
+
+} // namespace
+
+namespace arm_compute
+{
+void MEMInfo::set_policy(MemoryPolicy policy)
+{
+    _policy = policy;
+}
+
+MemoryPolicy MEMInfo::get_policy()
+{
+    return _policy;
+}
+MemoryPolicy MEMInfo::_policy = { MemoryPolicy::NORMAL };
+
+MEMInfo::MEMInfo()
+    : _total(0), _free(0), _buffer(0)
+{
+    parse_mem_info(_total, _free, _buffer);
+}
+
+size_t MEMInfo::get_total_in_kb() const
+{
+    return _total;
+}
+
+} // namespace arm_compute
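
Note (hedged usage sketch, not part of the patch): MEMInfo parses /proc/meminfo once, at construction time; on BARE_METAL builds or on a parse failure every field stays 0. MEMInfo is assumed to be declared in CPPTypes.h, the header this translation unit includes:

#include "arm_compute/core/CPP/CPPTypes.h"

#include <iostream>

int main()
{
    arm_compute::MEMInfo info;
    std::cout << "Total system memory: " << info.get_total_in_kb() << " kB\n";
    return 0;
}
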
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index 15bbb17..d116624 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -30,17 +30,13 @@
 Memory::Memory()
     : _region(nullptr), _region_owned(nullptr)
 {
-    create_empty_region();
 }
 
 Memory::Memory(std::shared_ptr<IMemoryRegion> memory)
     : _region(nullptr), _region_owned(std::move(memory))
 {
-    if(_region_owned == nullptr)
-    {
-        create_empty_region();
-    }
-    _region = _region_owned.get();
+    // _region_owned is already set from the initializer list; 'memory' has been moved from.
+    _region       = _region_owned.get();
 }
 
 Memory::Memory(IMemoryRegion *memory)
@@ -59,9 +55,15 @@
     return _region;
 }
 
-void Memory::create_empty_region()
+void Memory::set_region(IMemoryRegion *region)
 {
-    _region_owned = std::make_shared<MemoryRegion>(0);
+    _region_owned = nullptr;
+    _region       = region;
+}
+
+void Memory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
+{
+    _region_owned = std::move(region);
     _region       = _region_owned.get();
 }
 } // namespace arm_compute
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 4dfa28b..d9803a8 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,33 +29,15 @@
 
 #include <memory>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
-    : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager)), _allocator(nullptr), _is_finalized(false), _num_pools(1)
+    : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager))
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
     ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
 }
 
-bool MemoryManagerOnDemand::is_finalized() const
-{
-    return _is_finalized;
-}
-
-void MemoryManagerOnDemand::set_num_pools(unsigned int num_pools)
-{
-    ARM_COMPUTE_ERROR_ON(num_pools == 0);
-    _num_pools = num_pools;
-}
-
-void MemoryManagerOnDemand::set_allocator(IAllocator *allocator)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
-    ARM_COMPUTE_ERROR_ON(allocator == nullptr);
-    _allocator = allocator;
-}
-
 ILifetimeManager *MemoryManagerOnDemand::lifetime_manager()
 {
     return _lifetime_mgr.get();
@@ -66,23 +48,26 @@
     return _pool_mgr.get();
 }
 
-void MemoryManagerOnDemand::finalize()
+void MemoryManagerOnDemand::populate(arm_compute::IAllocator &allocator, size_t num_pools)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(is_finalized(), "Memory manager is already finalized!");
     ARM_COMPUTE_ERROR_ON(!_lifetime_mgr);
     ARM_COMPUTE_ERROR_ON(!_pool_mgr);
-    ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "All the objects have not been finalized! ");
-    ARM_COMPUTE_ERROR_ON(_allocator == nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr->are_all_finalized(), "All the objects have not been finalized!");
+    ARM_COMPUTE_ERROR_ON_MSG(_pool_mgr->num_pools() != 0, "Pool manager already contains pools!");
 
     // Create pools
-    auto pool_template = _lifetime_mgr->create_pool(_allocator);
-    for(int i = _num_pools; i > 1; --i)
+    auto pool_template = _lifetime_mgr->create_pool(&allocator);
+    for(int i = num_pools; i > 1; --i)
     {
         auto pool = pool_template->duplicate();
         _pool_mgr->register_pool(std::move(pool));
     }
     _pool_mgr->register_pool(std::move(pool_template));
-
-    // Set finalized to true
-    _is_finalized = true;
 }
+
+void MemoryManagerOnDemand::clear()
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!_pool_mgr, "Pool manager not specified correctly!");
+    _pool_mgr->clear_pools();
+}
+} // namespace arm_compute
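
Note (hedged usage sketch, not part of the patch): with finalize()/set_allocator()/set_num_pools() gone, the managers are wired in at construction, populate() creates the pools once all managed objects have been finalized, and clear() drops them again. BlobLifetimeManager, PoolManager and Allocator are the stock implementations assumed to be available in this release:

#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

int main()
{
    auto lifetime_mgr = std::make_shared<arm_compute::BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<arm_compute::PoolManager>();
    auto memory_mgr   = std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // ... configure functions against memory_mgr and allocate their tensors so
    // the lifetime manager can finalize every managed object ...

    arm_compute::Allocator allocator{};
    memory_mgr->populate(allocator, 1); // back one pool with the allocator
    // ... run the workloads ...
    memory_mgr->clear();                // release the pools
    return 0;
}
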
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 7d8e3cf..677e9f6 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -29,8 +29,8 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
@@ -51,3 +51,4 @@
 {
     return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 5c0491e..ceb4b49 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,20 +23,33 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
 
-using namespace arm_compute;
-
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+namespace arm_compute
+{
+void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
+
+    if(output->info()->dimension(0) > 1)
+    {
+        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+
+        if(broadcasted_info->info()->dimension(0) == 1)
+        {
+            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+        }
+    }
 }
+
 Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
     return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
 }
+} // namespace arm_compute
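
Note (illustrative, not part of the patch): the new configure() body only attaches a replicate border when the output is wider than one element and the selected input is actually broadcast along X (width of 1). The decision, extracted into a standalone predicate:

#include <iostream>

bool needs_broadcast_border(unsigned int in1_width, unsigned int in2_width, unsigned int out_width)
{
    if(out_width <= 1)
    {
        return false;
    }
    // Same selection as configure(): input1 if its width is 1, otherwise input2.
    const unsigned int broadcast_width = (in1_width == 1) ? in1_width : in2_width;
    return broadcast_width == 1;
}

int main()
{
    std::cout << std::boolalpha
              << needs_broadcast_border(1, 16, 16) << " "   // true: input1 is broadcast
              << needs_broadcast_border(16, 16, 16) << "\n"; // false: no broadcasting
    return 0;
}
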
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index d72c98b..0e5d50f 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -58,8 +58,7 @@
 {
 }
 
-void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value,
-                            bool use_fp16)
+void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -127,18 +126,9 @@
     _memory_group.manage(&_phase);
 
     // Configure gradient
-    if(use_fp16)
-    {
-        auto k = arm_compute::support::cpp14::make_unique<NEGradientFP16Kernel>();
-        k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
-        _gradient = std::move(k);
-    }
-    else
-    {
-        auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
-        k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
-        _gradient = std::move(k);
-    }
+    auto k = arm_compute::support::cpp14::make_unique<NEGradientKernel>();
+    k->configure(&_gx, &_gy, &_magnitude, &_phase, norm_type);
+    _gradient = std::move(k);
 
     // Allocate intermediate tensors
     _gx.allocator()->allocate();
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
new file mode 100644
index 0000000..485abfe
--- /dev/null
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEChannelShuffleLayerKernel>();
+    k->configure(input, output, num_groups);
+    _kernel = std::move(k);
+}
+
+Status NEChannelShuffleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+{
+    return NEChannelShuffleLayerKernel::validate(input, output, num_groups);
+}
+} // namespace arm_compute
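
A short sketch of the newly added NEON channel shuffle function; the shapes are illustrative and the channel count is assumed to be divisible by num_groups:

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void channel_shuffle_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32)); // W x H x C
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

    NEChannelShuffleLayer shuffle;
    shuffle.configure(&src, &dst, 4 /* num_groups */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src ...
    shuffle.run();
}
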
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index fda9f57..6887a0a 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 using namespace arm_compute;
 using namespace arm_compute::misc::shape_calculator;
@@ -35,7 +36,10 @@
     : _memory_group(std::move(memory_manager)),
       _conv_f(),
       _upsample_f(),
+      _flip_weights(),
       _scaled_output(),
+      _weights_flipped(),
+      _original_weights(nullptr),
       _input(nullptr),
       _info(),
       _inner_border(),
@@ -60,9 +64,9 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
 
     auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
-                                                    info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     if(bias != nullptr)
     {
@@ -73,15 +77,17 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-        const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+        const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
     }
 
-    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
-                                                                                                      info)));
+    unsigned int        padx            = 0;
+    unsigned int        pady            = 0;
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
@@ -99,33 +105,45 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    _input        = input;
-    _info         = info;
-    _inner_border = std::make_pair(inner_border_right, inner_border_top);
-    _is_prepared  = false;
+    _input            = input;
+    _original_weights = weights;
+    _info             = info;
+    _inner_border     = std::make_pair(inner_border_right, inner_border_top);
+    _is_prepared      = false;
 
     const unsigned int stride_x = info.stride().first;
     const unsigned int stride_y = info.stride().second;
 
+    _weights_flipped.allocator()->init(TensorInfo(weights->info()->tensor_shape(), 1, weights->info()->data_type()));
+    _flip_weights.configure(weights, &_weights_flipped);
+
+    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+                                                    info.pad().first, info.pad().second, stride_x, stride_y);
+
+    const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
 
     _memory_group.manage(&_scaled_output);
 
-    // configure scale function
-    // Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
-    const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type());
+    // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
+    unsigned int      padx            = 0;
+    unsigned int      pady            = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, inner_border_right, inner_border_top, out_dims, padx, pady);
+
+    TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
     _scaled_output.allocator()->init(scale_out_info);
 
+    const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
+    _upsample_f.configure(input, &_scaled_output, upsample_info, inner_border_right, inner_border_top);
+
     // Set up the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-    _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
-
-    // Allocate auxiliary tensors
+    _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
     _scaled_output.allocator()->allocate();
-
-    // configure upsample function
-    _upsample_f.configure(input, &_scaled_output, info, inner_border_right, inner_border_top);
 }
 
 void NEDeconvolutionLayer::run()
@@ -144,7 +162,21 @@
 {
     if(!_is_prepared)
     {
+        ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+        // Run weights flipping and mark original weights tensor as unused
+        _weights_flipped.allocator()->allocate();
+        NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+        _original_weights->mark_as_unused();
+
+        // Prepare convolution
         _conv_f.prepare();
+
+        if(!_weights_flipped.is_used())
+        {
+            _weights_flipped.allocator()->free();
+        }
+
         _is_prepared = true;
     }
-}
\ No newline at end of file
+}
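
The deconvolution now flips the weights once during prepare() and marks the original weights tensor as unused afterwards; a minimal call-sequence sketch with illustrative shapes and strides:

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void deconvolution_example()
{
    Tensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(15U, 15U, 16U), 1, DataType::F32));

    NEDeconvolutionLayer deconv;
    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 1, 1),
                     0 /* inner_border_right */, 0 /* inner_border_top */);

    // ... allocate and fill the tensors ...
    deconv.prepare(); // flips the weights once; the original weights tensor is then marked unused
    deconv.run();
}
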
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 9a75404..0041c1f 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,9 +30,14 @@
 
 using namespace arm_compute;
 
-void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
+
+Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    return NEDepthConvertLayerKernel::validate(input, output, policy, shift);
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 24b12f4..a2f0094 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -36,14 +36,16 @@
 using namespace arm_compute::misc::shape_calculator;
 
 NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
-    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(),
-      _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false)
+    : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _activationlayer_function(), _accumulator(), _permuted_input(),
+      _permuted_weights(), _permuted_output(), _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false),
+      _is_activationlayer_enabled(false)
 {
 }
 
-void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                               unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
 
     PixelValue zero_value(0.f);
@@ -59,8 +61,25 @@
     _is_nchw              = input->info()->data_layout() == DataLayout::NCHW;
     _permute              = _is_optimized == _is_nchw;
 
+    // Initialize the intermediate accumulator tensor in case of quantized input
+    if(_is_quantized)
+    {
+        TensorShape accum_shape  = output->info()->tensor_shape();
+        DataLayout  accum_layout = output->info()->data_layout();
+        if(!_is_optimized && !_is_nchw)
+        {
+            permute(accum_shape, PermutationVector(1U, 2U, 0U));
+            accum_layout = DataLayout::NCHW;
+        }
+
+        _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+        _accumulator.info()->set_data_layout(accum_layout);
+        zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+    }
+
     if(_is_optimized)
     {
+        ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
         if(_is_nchw)
         {
             // Configure the function to transform the input tensor from NCHW -> NHWC
@@ -75,8 +94,8 @@
             _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
 
             // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
-            _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
-            _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+            _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+            _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
 
             // Allocate tensors
             _permuted_input.allocator()->allocate();
@@ -85,26 +104,11 @@
         }
         else
         {
-            _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+            _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
         }
     }
     else
     {
-        // Allocate the intermediate accumulator tensor in case of quantized input
-        if(_is_quantized)
-        {
-            TensorShape accum_shape = output->info()->tensor_shape();
-
-            if(!_is_nchw)
-            {
-                permute(accum_shape, PermutationVector(1U, 2U, 0U));
-            }
-
-            _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
-            _accumulator.info()->set_quantization_info(input->info()->quantization_info());
-            zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
-        }
-
         if(!_is_nchw)
         {
             // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -143,7 +147,7 @@
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
-        _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+        _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
         _accumulator.allocator()->allocate();
     }
     else if(_has_bias)
@@ -157,21 +161,46 @@
         _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
         _permuted_output.allocator()->allocate();
     }
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                unsigned int depth_multiplier)
+                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
 
     if(biases != nullptr)
     {
+        const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
 
-    return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier);
+    const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    TensorInfo accumulator  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, is_quantized ? &accumulator : output, conv_info, depth_multiplier));
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, biases, output));
+    }
+
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+    }
+
+    return Status{};
 }
 
 void NEDepthwiseConvolutionLayer3x3::run()
@@ -222,16 +251,22 @@
     {
         _permute_output.run();
     }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
     : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
-      _permute_weights(), _permute_output(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
-      _is_quantized(false), _is_nhwc(false), _original_weights(nullptr)
+      _permute_weights(), _permute_output(), _activationlayer_function(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(),
+      _permuted_output(), _is_prepared(false), _is_quantized(false), _is_nhwc(false), _is_activationlayer_enabled(false), _original_weights(nullptr)
 {
 }
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+                                            unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_UNUSED(channel_idx);
@@ -353,13 +388,24 @@
     // Allocate intermediate tensors
     _input_reshaped.allocator()->allocate();
     _v2mm_output.allocator()->allocate();
+
+    // Configure Activation Layer
+    _is_activationlayer_enabled = act_info.enabled();
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.configure(output, nullptr, act_info);
+    }
 }
 
 Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier)
+                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
+
+    const unsigned int width_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
     // Clone output to use auto init
     auto output_clone = output->clone();
@@ -391,8 +437,8 @@
     const size_t       weights_w    = weights_to_use->dimension(0);
     const size_t       weights_h    = weights_to_use->dimension(1);
     const size_t       weights_z    = weights_to_use->dimension(2);
-    const unsigned int conv_w       = output_shape.x();
-    const unsigned int conv_h       = output_shape.y();
+    const unsigned int conv_w       = output_shape[width_idx];
+    const unsigned int conv_h       = output_shape[height_idx];
     const size_t       patch_size   = weights_w * weights_h + (append_bias ? 1 : 0);
     const size_t       conv_size    = conv_w * conv_h;
 
@@ -438,6 +484,12 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use));
     }
 
+    // Validate Activation Layer
+    if(act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
+    }
+
     return Status{};
 }
 
@@ -463,6 +515,11 @@
     {
         _permute_output.run();
     }
+
+    if(_is_activationlayer_enabled)
+    {
+        _activationlayer_function.run();
+    }
 }
 
 void NEDepthwiseConvolutionLayer::prepare()
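
Both depthwise variants now accept an ActivationLayerInfo, so a ReLU can be fused at configure time instead of being run as a separate function; an illustrative sketch:

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depthwise_example()
{
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    NEDepthwiseConvolutionLayer3x3 dwc;
    dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    // ... allocate and fill the tensors, then dwc.run() ...
}
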
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 1814d61..57bef2b 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -23,7 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
 
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "arm_compute/core/Size2D.h"
 #include "support/ToolchainSupport.h"
 
@@ -31,7 +31,12 @@
 
 void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, false, true);
+    auto k = arm_compute::support::cpp14::make_unique<NEFlattenLayerKernel>();
+    k->configure(input, output);
     _kernel = std::move(k);
+}
+
+Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEFlattenLayerKernel::validate(input, output);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index 0000cdd..8179188 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,17 @@
 #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
 #include "support/ToolchainSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEFloor::configure(const ITensor *input, ITensor *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEFloorKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status NEFloor::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return NEFloorKernel::validate(input, output);
+}
+} // namespace arm_compute
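
NEFloor now exposes a validate() entry point, so support can be checked on ITensorInfo objects before anything is allocated; a small sketch:

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

bool floor_supported()
{
    const TensorInfo src_info(TensorShape(32U, 32U), 1, DataType::F32);
    const TensorInfo dst_info(TensorShape(32U, 32U), 1, DataType::F32);

    // Status converts to bool: true if this configuration would be accepted.
    return bool(NEFloor::validate(&src_info, &dst_info));
}
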
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index f1606aa..45e21b5 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -50,6 +50,7 @@
         // Validate gemmlowp function
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
                                                                            &weights.clone()->set_quantization_info(weights_quantization_info),
+                                                                           nullptr,
                                                                            &output));
     }
     else
@@ -74,8 +75,8 @@
 }
 
 NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _im2col_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
-      _im2col_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
+    : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), _accumulate_biases_kernel(),
+      _flatten_output(), _gemmlowp_output(), _converted_weights_output(), _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false),
       _is_fc_after_conv(false), _accumulate_biases(false), _is_quantized(false), _is_prepared(false)
 {
 }
@@ -93,7 +94,7 @@
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
         // Configure gemmlowp function
-        _mm_gemmlowp.configure(input, weights, output);
+        _mm_gemmlowp.configure(input, weights, nullptr, output);
 
         // Revert the QuantizationInfo as input and weights could be used in other fully connected layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -112,19 +113,19 @@
 
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
-    // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_flatten_shape(input->info());
-    _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    // Initialize output tensor for flatten
+    TensorShape shape_flatten = compute_flatten_shape(input->info());
+    _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
-    // Configure im2col kernel
-    _memory_group.manage(&_im2col_output);
-    _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true);
+    // Configure flatten kernel
+    _memory_group.manage(&_flatten_output);
+    _flatten_kernel.configure(input, &_flatten_output);
 
     // Configure matrix multiply kernel
-    configure_mm(&_im2col_output, weights, output);
+    configure_mm(&_flatten_output, weights, output);
 
-    // Allocate the output tensor for im2col once all the configure methods have been called
-    _im2col_output.allocator()->allocate();
+    // Allocate the output tensor for flatten once all the configure methods have been called
+    _flatten_output.allocator()->allocate();
 }
 
 void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output)
@@ -249,7 +250,7 @@
     bool is_fc_after_conv = true;
     bool is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
 
-    const ITensorInfo &im2col_input      = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
+    const ITensorInfo &flatten_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
     const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
     const ITensorInfo &gemmlowp_output   = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
@@ -307,9 +308,9 @@
         // Fully Connected layer after a Convolution Layer without batches
         ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
 
-        // Validate im2col kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, Size2D(1U, 1U), 1, true));
-        input_to_use = &im2col_input;
+        // Validate flatten kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+        input_to_use = &flatten_input;
     }
     else
     {
@@ -337,7 +338,7 @@
     // Linearize input if it comes from a convolutional layer
     if(_is_fc_after_conv)
     {
-        NEScheduler::get().schedule(&_im2col_kernel, Window::DimY);
+        NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
     }
 
     // Run matrix multiply
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index de51266..72a3e80 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -62,7 +62,14 @@
 
     if(run_optimised)
     {
-        _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+        if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
+        {
+            _asm_glue.configure(a, b, d, alpha, beta, false);
+        }
+        else
+        {
+            _asm_glue.configure(a, b, d, alpha, beta, _reshape_b_only_on_first_run);
+        }
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
     }
     else
@@ -132,7 +139,7 @@
     ARM_COMPUTE_UNUSED(alpha);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
@@ -140,7 +147,7 @@
 
     if(c != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
@@ -150,7 +157,7 @@
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-        if(gemm_info.depth_output_gemm3d() != 1)
+        if(gemm_info.depth_output_gemm3d() != 0)
         {
             if(gemm_info.reinterpret_input_as_3d())
             {
@@ -174,7 +181,7 @@
     if(!run_optimised)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMM cannot reinterpret the output tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
 
         // Check if the first input tensor is a vector.
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -218,6 +225,12 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
     }
 
+    // Validate matrix addition kernel
+    if(beta != 0 && c != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
     return Status{};
 }
 
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 29db654..922f757 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -45,6 +45,7 @@
     //Note: It's safe to not check for FP16 support because this was already checked in NEGEMMAssemblyDispatch::configure()
     switch(method)
     {
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_FP16:
         case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
         {
             if(!pretranspose_hint)
@@ -227,7 +228,7 @@
         // Forcing 128-byte alignment (required by 32-bit kernels)
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
-        _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
+        _pretranspose.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
         _pretranspose.allocator()->allocate();
         ARM_COMPUTE_ERROR_ON_NULLPTR(_pretranspose.buffer());
     }
@@ -258,7 +259,7 @@
 void Fallback<TypeInput, TypeOutput>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
 {
     ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0");
-    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
+    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
     memory_group.manage(&_workspace);
     _workspace.allocator()->allocate();
 }
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 92e641e..0232a83 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -32,6 +32,7 @@
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
+#include <set>
 #include <tuple>
 
 using namespace arm_compute;
@@ -100,6 +101,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info(), gemm_3d_depth, _skip_im2col));
 
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
+
     if(_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -110,7 +114,7 @@
         input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
         weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
-        _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+        _mm_gemmlowp.configure(input, weights, nullptr, output, gemm_info);
 
         // Revert the QuantizationInfo as input and weights could be used in other convolution layers
         input->info()->set_quantization_info(input_quantization_info);
@@ -119,8 +123,7 @@
     else
     {
         // Configure matrix multiply function
-        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, gemm_3d_depth,
-                                                                                 _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
     }
 }
 
@@ -128,7 +131,8 @@
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
-    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col);
+    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+                                         gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */);
     if(is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -142,7 +146,7 @@
         weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
 
         // Perform validation step on GEMMLowp
-        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), output, gemm_info);
+        return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), nullptr, output, gemm_info);
     }
     else
     {
@@ -185,19 +189,18 @@
     const DataLayout data_layout = input->info()->data_layout();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
 
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
 
-    _is_prepared      = weights_info.retain_internal_weights();
-    _original_weights = weights;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
-    _data_layout      = data_layout;
-    _skip_im2col      = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    _skip_col2im      = data_layout == DataLayout::NHWC;
-    _append_bias      = (biases != nullptr) && (!_is_quantized);
+    _is_prepared                = weights_info.retain_internal_weights();
+    _original_weights           = weights;
+    _is_quantized               = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _data_layout                = data_layout;
+    _skip_im2col                = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    _append_bias                = (biases != nullptr) && (!_is_quantized);
+    _is_activationlayer_enabled = act_info.enabled();
 
     const ITensor *gemm_input_to_use         = input;
     ITensor       *gemm_output_to_use        = output;
@@ -214,17 +217,20 @@
                                                  dilation);
 
     // Check if GEMM3D is supported
-    if(_skip_col2im)
+    if(data_layout == DataLayout::NHWC)
     {
+        _skip_col2im = bool(validate_gemm3d(input->info()->data_type(), conv_h, true));
         // If not supported, we need to perform im2col and col2im (or reshape layer)
-        if(!bool(validate_gemm3d(input->info()->data_type(), conv_h, _skip_im2col)))
+        if(!_skip_col2im)
         {
             _skip_im2col = false;
-            _skip_col2im = false;
         }
     }
+    else
+    {
+        _skip_col2im = false;
+    }
 
-    const unsigned bias_element  = (_append_bias && !_skip_im2col) ? 1 : 0;
     const ITensor *biases_to_use = (_append_bias && !_skip_im2col) ? biases : nullptr;
 
     // Get parameters from conv_info
@@ -233,7 +239,6 @@
     std::tie(stride_x, stride_y) = conv_info.stride();
 
     unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
-    unsigned int mat_weights_rows = weights->info()->dimension(idx_width) * weights->info()->dimension(idx_height) * weights->info()->dimension(idx_channel) + bias_element;
 
     // _weights_reshaped will be auto configured in the kernel.
     // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
@@ -242,14 +247,6 @@
     // Create tensor to store im2col reshaped inputs
     if(!_skip_im2col)
     {
-        // Calculate im2col shape
-        // For NEON the batch size is on the fourth dimension
-        TensorShape shape_im2col = input->info()->tensor_shape();
-        shape_im2col.set(0, mat_weights_rows);
-        shape_im2col.set(1, conv_w * conv_h);
-        shape_im2col.set(2, 1);
-
-        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
         _memory_group.manage(&_im2col_output);
 
         // Configure
@@ -265,17 +262,27 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
-    if(!_skip_col2im)
+    if(!_skip_col2im || _is_quantized)
     {
-        // Calculate GEMM output shape
-        TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
-        shape_gemm.set(0, mat_weights_cols);
-        shape_gemm.set(1, conv_w * conv_h);
-
         // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
         const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
+        TensorShape    shape_gemm;
+
+        if(_is_quantized && _skip_col2im)
+        {
+            shape_gemm = output->info()->tensor_shape();
+        }
+        else
+        {
+            // Calculate GEMM output shape
+            shape_gemm = _im2col_output.info()->tensor_shape();
+            shape_gemm.set(0, mat_weights_cols);
+            shape_gemm.set(1, conv_w * conv_h);
+        }
+
+        // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->info()->quantization_info());
+        info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
         _memory_group.manage(&_gemm_output);
 
@@ -284,7 +291,9 @@
     }
 
     // Configure GEMM
-    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, _skip_col2im ? conv_h : 1);
+    // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
@@ -294,16 +303,39 @@
     // Configure output stage for quantized case
     if(_is_quantized)
     {
-        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+        const QuantizationInfo input_quant_info  = input->info()->quantization_info();
+        const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input_quant_info : output->info()->quantization_info();
 
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+        float multiplier = input_quant_info.scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        _memory_group.manage(&_tmp_output);
-        gemm_output_staged_to_use = &_tmp_output;
+        if(!_skip_col2im)
+        {
+            _memory_group.manage(&_tmp_output);
+            gemm_output_staged_to_use = &_tmp_output;
+        }
 
-        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+        if(_is_activationlayer_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            _is_activationlayer_enabled = false;
+        }
+
+        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, min_activation, max_activation);
     }
 
     if(!_skip_col2im)
@@ -320,12 +352,12 @@
         }
     }
 
-    if(_is_quantized)
+    if(_is_quantized && !_skip_col2im)
     {
         _tmp_output.allocator()->allocate();
     }
 
-    if(!_skip_col2im)
+    if(!_skip_col2im || _is_quantized)
     {
         _gemm_output.allocator()->allocate();
     }
@@ -333,9 +365,7 @@
     ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
-    //Configure Activation Layer
-    _is_activationlayer_enabled = act_info.enabled();
-
+    // Configure Activation Layer
     if(_is_activationlayer_enabled)
     {
         _activationlayer_function.configure(output, nullptr, act_info);
@@ -370,10 +400,10 @@
     const ITensorInfo *gemm_output_staged_to_use = output;
     const ITensorInfo *weights_to_use            = weights;
 
-    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
-    const bool append_bias  = (biases != nullptr) && (!is_quantized);
-    bool       skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-    bool       skip_col2im  = data_layout == DataLayout::NHWC;
+    const bool is_quantized          = is_data_type_quantized_asymmetric(data_type);
+    const bool append_bias           = (biases != nullptr) && (!is_quantized);
+    bool       skip_im2col           = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    bool       is_activation_enabled = act_info.enabled();
 
     // Get convolved dimensions
     unsigned int conv_w = 0;
@@ -387,6 +417,17 @@
                                                  dilation);
 
     // Check if GEMM3D is supported
+    bool skip_col2im = false;
+    if(data_layout == DataLayout::NHWC)
+    {
+        skip_col2im = bool(validate_gemm3d(input->data_type(), conv_h, true));
+        // If not supported, we need to perform im2col and col2im (or reshape layer)
+        if(!skip_col2im)
+        {
+            skip_im2col = false;
+        }
+    }
+
     if(skip_col2im)
     {
         // If not supported, we need to perform im2col and col2im (or reshape layer)
@@ -435,6 +476,7 @@
     {
         // Create tensor info for im2col reshaped inputs
         // For NEON the batch size is on the fourth dimension
+        // TODO (giaiod01): Auto-initialize the output shape of im2col COMPMID-1482
         TensorShape shape_im2col = input->tensor_shape();
         shape_im2col.set(0, mat_weights_rows);
         shape_im2col.set(1, conv_w * conv_h);
@@ -453,33 +495,60 @@
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
+    const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
     if(!skip_col2im)
     {
         TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
-        const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
-        // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
         info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->quantization_info());
-
-        gemm_output_to_use = &info_gemm;
     }
+    else
+    {
+        info_gemm = TensorInfo(output->tensor_shape(), 1, gemm_data_type);
+    }
+    info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
+    gemm_output_to_use = &info_gemm;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 1, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
 
     if(is_quantized)
     {
-        float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        const QuantizationInfo input_quant_info  = input->quantization_info();
+        const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input_quant_info : output->quantization_info();
+        const float            multiplier        = input_quant_info.scale * weights_to_use->quantization_info().scale / output_quant_info.scale;
+        int                    output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
-        tmp_info.set_quantization_info(output->quantization_info());
-        gemm_output_staged_to_use = &tmp_info;
+        if(!skip_col2im)
+        {
+            tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+            tmp_info.set_quantization_info(output->quantization_info()).set_data_layout(data_layout);
+            gemm_output_staged_to_use = &tmp_info;
+        }
+
+        // Merge activation with output stage
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+                                                                                 };
+
+        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+        {
+            const int a_const_int = output_quant_info.quantize(act_info.a(), RoundingPolicy::TO_NEAREST_UP);
+            const int b_const_int = output_quant_info.quantize(act_info.b(), RoundingPolicy::TO_NEAREST_UP);
+
+            min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? output_quant_info.offset : b_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+
+            is_activation_enabled = false;
+        }
 
         // Validate output stage for quantized case
-        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min_activation, max_activation);
     }
 
     // Validate Col2Im/ReshapeLayer
@@ -491,7 +560,7 @@
     }
 
     //Validate Activation Layer
-    if(act_info.enabled())
+    if(is_activation_enabled)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
     }
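
On the quantized path, a ReLU-family activation passed through act_info is folded into the GEMMLowp output stage instead of running as a separate layer; a QASYMM8 sketch with illustrative quantization parameters:

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void quantized_conv_example()
{
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 5)));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 0)));

    NEGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), WeightsInfo(), Size2D(1U, 1U),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));

    // ... allocate and fill the tensors, then conv.run() ...
}
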
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 828011d..4b02694 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,10 +47,11 @@
 {
 }
 
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+    ARM_COMPUTE_UNUSED(c);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
 
     // Clear state
     _mtx_a_reshape_kernel = nullptr;
@@ -181,49 +182,76 @@
     }
 }
 
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
-                                    "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
-                                    "The output matrix must have the same number of columns as the matrix B");
-    ARM_COMPUTE_UNUSED(gemm_info);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
 
-    int32_t a_offset                         = a->quantization_info().offset;
-    int32_t b_offset                         = b->quantization_info().offset;
-    bool    run_vector_matrix_multiplication = a->dimension(1) < 2;
+    int32_t    a_offset                    = a->quantization_info().offset;
+    int32_t    b_offset                    = b->quantization_info().offset;
+    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
 
-    if(!run_vector_matrix_multiplication)
+    // Check if we need to run the optimized assembly kernel
+    const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, output, 1.f, 0.f, reshape_b_only_on_first_run));
+
+    if(run_optimised)
     {
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        TensorShape shape_tmp_a = a->tensor_shape();
-        shape_tmp_a.set(0, a->dimension(0) * 4);
-        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        TensorShape shape_tmp_b = b->tensor_shape();
-        shape_tmp_b.set(0, b->dimension(1) * 16);
-        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
-        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
-        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
-
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        if(output->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+            if(gemm_info.depth_output_gemm3d() != 0)
+            {
+                if(gemm_info.reinterpret_input_as_3d())
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+                }
+                else
+                {
+                    ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+                }
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+            }
+        }
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
+
+        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+        if(!run_vector_matrix_multiplication)
+        {
+            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+            TensorShape shape_tmp_a = a->tensor_shape();
+            shape_tmp_a.set(0, a->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->tensor_shape();
+            shape_tmp_b.set(0, b->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+            TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+            TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+        }
     }
 
     TensorInfo info_vector_sum_col, info_vector_sum_row;
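
The configure() and validate() signatures above now take an optional bias tensor c, which is currently rejected when non-null. A minimal usage sketch of the updated interface, passing c as nullptr; the shapes, names and omitted quantization details are illustrative and not part of the diff:

    const unsigned int M = 4, N = 8, K = 16;
    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8));
    dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));

    NEGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr /* c: bias addition not supported */, &dst, GEMMInfo());

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b, then:
    gemmlowp.run();
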
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 8c02436..ce69fa0 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 25e28d2..db5e926 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
 
 void NEHarrisCorners::configure(IImage *input, float threshold, float min_dist,
                                 float sensitivity, int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
-                                BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
+                                BorderMode border_mode, uint8_t constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -126,62 +126,31 @@
     // Manage intermediate buffers
     _memory_group.manage(&_score);
 
-    if(use_fp16)
+    // Set/init Harris Score kernel according to block_size
+    switch(block_size)
     {
-        switch(block_size)
+        case 3:
         {
-            case 3:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<3>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            break;
-            case 5:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<5>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            break;
-            case 7:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreFP16Kernel<7>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            default:
-                break;
+            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
+            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+            _harris_score = std::move(k);
         }
-    }
-    else
-    {
-        // Set/init Harris Score kernel accordingly with block_size
-        switch(block_size)
+        break;
+        case 5:
         {
-            case 3:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<3>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            break;
-            case 5:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            break;
-            case 7:
-            {
-                auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
-                k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
-                _harris_score = std::move(k);
-            }
-            default:
-                break;
+            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<5>>();
+            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+            _harris_score = std::move(k);
         }
+        break;
+        case 7:
+        {
+            auto k = arm_compute::support::cpp14::make_unique<NEHarrisScoreKernel<7>>();
+            k->configure(&_gx, &_gy, &_score, norm_factor, threshold, sensitivity, border_mode == BorderMode::UNDEFINED);
+            _harris_score = std::move(k);
+        }
+        default:
+            break;
     }
 
     // Configure border filling before harris score
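
With the dedicated FP16 path removed, configure() no longer takes a use_fp16 flag. A hedged sketch of the updated call; the U8 input image, array capacity and parameter values below are illustrative:

    Image           src;              // U8 input, initialised and filled elsewhere
    KeyPointArray   corners(1000);    // capacity for detected key points
    NEHarrisCorners harris;
    // threshold, min_dist, sensitivity, gradient_size, block_size
    harris.configure(&src, 20000.f, 5.f, 0.04f, 3, 3, &corners,
                     BorderMode::CONSTANT, 0 /* constant border value */);
    harris.run();
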
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 4245b65..9102fca 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -34,18 +34,17 @@
 {
 }
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups,
-                         bool is_fully_connected, bool is_flatten)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
-    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
 }
 
 Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                          unsigned int num_groups, bool is_fully_connected, bool is_flatten)
+                          unsigned int num_groups)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups, is_fully_connected, is_flatten);
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups);
 }
 
 void NEIm2Col::run()
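
The is_fully_connected and is_flatten flags are gone from both configure() and validate(). A sketch of a call against the reduced interface, with an illustrative 3x3 kernel and tensors set up elsewhere:

    Tensor   src, col;
    NEIm2Col im2col;
    // 3x3 kernel, stride 1, no padding, no bias column, unit dilation, single group
    im2col.configure(&src, &col, Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0),
                     false /* has_bias */, Size2D(1U, 1U), 1 /* num_groups */);
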
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
new file mode 100644
index 0000000..7c7580a
--- /dev/null
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _gemm_input_gate(), _transpose_input_gate(), _accum_input_gate1(), _accum_input_gate2(), _subtract_input_gate(),
+      _pixelwise_mul_input_gate(), _activation_input_gate(), _fully_connected_forget_gate(), _gemm_forget_gate(), _transpose_forget_gate(), _accum_forget_gate1(), _accum_forget_gate2(),
+      _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(),
+      _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
+      _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
+      _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(),
+      _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(),
+      _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false),
+      _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false)
+{
+}
+
+void NELSTMLayer::configure(const ITensor *input,
+                            const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
+                            const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
+                            const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
+                            const ITensor *output_state_in, const ITensor *cell_state_in,
+                            ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
+                            const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input,
+                                 input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                 recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                 forget_gate_bias, cell_bias, output_gate_bias,
+                                 output_state_in, cell_state_in,
+                                 scratch_buffer, output_state_out, cell_state_out, output);
+
+    // Set lstm parameters
+    LSTMParams<ITensorInfo> lstm_params_info;
+    if(lstm_params.has_peephole_opt())
+    {
+        lstm_params_info.set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+    }
+    if(lstm_params.has_projection())
+    {
+        lstm_params_info.set_projection_params(lstm_params.projection_weights()->info(),
+                                               lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
+    }
+    if(!lstm_params.has_cifg_opt())
+    {
+        const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+        lstm_params_info.set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
+                                         cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
+    }
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
+                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
+                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+                                                     output_state_in->info(), cell_state_in->info(),
+                                                     scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+                                                     lstm_params_info, activation_info, cell_threshold, projection_threshold));
+
+    const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
+
+    // Configure block that calculates the forget gate
+    // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias)
+    TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type()));
+    _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    _memory_group.manage(&_forget_gate_out1);
+    _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1);
+    _memory_group.manage(&_forget_gate_out2);
+    _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2);
+    _memory_group.manage(&_forget_gate_out3);
+    _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f);
+    _forget_gate_out2.allocator()->allocate();
+    _memory_group.manage(&_forget_gate_out5);
+    _accum_forget_gate1.configure(&_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE);
+    Tensor *forget_gate_out = &_forget_gate_out5;
+
+    if(lstm_params.has_peephole_opt())
+    {
+        _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        _run_peephole_opt = true;
+        _memory_group.manage(&_forget_gate_out4);
+        _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accum_forget_gate2.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _forget_gate_out4.allocator()->allocate();
+        _forget_gate_out5.allocator()->allocate();
+        forget_gate_out = &_forget_gate_out3;
+    }
+    else
+    {
+        _forget_gate_out3.allocator()->allocate();
+    }
+    _activation_forget_gate.configure(forget_gate_out, &_forget_gate_out1, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    forget_gate_out->allocator()->allocate();
+
+    // Configure block that calculates the input gate
+    // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
+    // input_gate = 1 - forget_gate, with CIFG
+    _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    if(lstm_params.has_cifg_opt())
+    {
+        _memory_group.manage(&_input_gate_out1);
+        _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _subtract_input_gate.configure(&_ones, &_forget_gate_out1, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _ones.allocator()->allocate();
+        _run_cifg_opt = true;
+    }
+    else
+    {
+        TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+
+        _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type()));
+        _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+        _memory_group.manage(&_input_gate_out1);
+        _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1);
+        _memory_group.manage(&_input_gate_out2);
+        _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+        _memory_group.manage(&_input_gate_out3);
+        _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f);
+        _input_gate_out2.allocator()->allocate();
+        _memory_group.manage(&_input_gate_out4);
+        _accum_input_gate1.configure(&_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE);
+        if(_run_peephole_opt)
+        {
+            _memory_group.manage(&_input_gate_out5);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _input_gate_out5.allocator()->allocate();
+        }
+        _input_gate_out3.allocator()->allocate();
+        _input_gate_out4.allocator()->allocate();
+        _activation_input_gate.configure(&_input_gate_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    }
+
+    // Configure block that calculates the cell state
+    // cell_state = Clip((PixelwiseMul(input_gate, Activation(input * input_to_cell_weights + output_state_in * recurrent_to_cell_weights + cell_bias)) + PixelwiseMul(forget_gate, cell_state)), cell_threshold)
+    TensorShape cell_state1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    _cell_state_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out2.allocator()->init(TensorInfo(cell_state1_shape, 1, input->info()->data_type()));
+    _cell_state_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    _memory_group.manage(&_cell_state_out1);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, cell_bias, &_cell_state_out1);
+    _memory_group.manage(&_cell_state_out2);
+    _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
+    _memory_group.manage(&_cell_state_out3);
+    _gemm_cell_state1.configure(output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _cell_state_out2.allocator()->allocate();
+    _memory_group.manage(&_cell_state_out4);
+    _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _activation_cell_state.configure(&_cell_state_out4, nullptr, activation_info);
+    _memory_group.manage(&_cell_state_out5);
+    _pixelwise_mul_cell_state1.configure(&_cell_state_out4, &_input_gate_out1, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _input_gate_out1.allocator()->allocate();
+    _cell_state_out4.allocator()->allocate();
+    _pixelwise_mul_cell_state2.configure(&_forget_gate_out1, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _forget_gate_out1.allocator()->allocate();
+    _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _cell_state_out3.allocator()->allocate();
+    _cell_state_out5.allocator()->allocate();
+    // Perform clipping
+    if(cell_threshold != 0.f)
+    {
+        _perform_cell_clipping = true;
+        _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+    }
+
+    // Configure block that calculates the output
+    // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias)
+    TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info());
+    _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type()));
+    _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    _memory_group.manage(&_output1);
+    _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1);
+    _memory_group.manage(&_output2);
+    _transpose_output.configure(recurrent_to_output_weights, &_output2);
+    _memory_group.manage(&_output3);
+    _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f);
+    _output2.allocator()->allocate();
+    _memory_group.manage(&_output5);
+    _accum_output1.configure(&_output1, &_output3, &_output5, ConvertPolicy::SATURATE);
+    _output3.allocator()->allocate();
+    Tensor *output_gate_out = &_output5;
+    if(lstm_params.has_peephole_opt())
+    {
+        _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
+
+        _memory_group.manage(&_output4);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE);
+        _output5.allocator()->allocate();
+        output_gate_out = &_output1;
+
+        // Allocate intermediate buffers
+        _output4.allocator()->allocate();
+    }
+    else
+    {
+        _output1.allocator()->allocate();
+    }
+    _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    output_gate_out->allocator()->allocate();
+
+    // Configure block that calculates the output state
+    /** lstm_res = PixelwiseMul(output, Activation(cell_state))
+     *
+     *                      -- Clip(lstm_res * projection_weights + projection_bias, projection_threshold) , if there is a projection
+     *                     /
+     *  output_state =  --
+     *                     \
+     *                      -- lstm_res , otherwise
+     */
+    ITensor *output_state_out_tmp = lstm_params.has_projection() ? &_output_state1 : output_state_out;
+    _cell_state_activation.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+    _output_state1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+
+    _memory_group.manage(&_cell_state_activation);
+    _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
+    _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state_activation.allocator()->allocate();
+
+    if(lstm_params.has_projection())
+    {
+        _has_projection_weights = true;
+        _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _output_state1.allocator()->allocate();
+        // Perform clipping
+        if(projection_threshold != 0.f)
+        {
+            _perform_projection_clipping = true;
+            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+        }
+    }
+
+    // Copy cell state and output
+    _copy_cell_state.configure(&_cell_state_out1, cell_state_out);
+    _cell_state_out1.allocator()->allocate();
+    _copy_output.configure(output_state_out, output);
+
+    // Vector for holding the tensors to store in scratch buffer
+    std::vector<ITensor *> scratch_inputs;
+    if(!lstm_params.has_cifg_opt())
+    {
+        scratch_inputs.emplace_back(&_input_gate_out1);
+    }
+    scratch_inputs.emplace_back(&_cell_state_out1);
+    scratch_inputs.emplace_back(forget_gate_out);
+    scratch_inputs.emplace_back(output_gate_out);
+    _concat_scratch_buffer.configure(scratch_inputs, scratch_buffer);
+}
+
+Status NELSTMLayer::validate(const ITensorInfo *input,
+                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
+                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
+                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
+                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
+                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
+                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
+                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                        forget_gate_bias, cell_bias, output_gate_bias,
+                                        output_state_in, cell_state_in,
+                                        scratch_buffer, output_state_out, cell_state_out, output);
+
+    // Check data types
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
+                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                                                       forget_gate_bias, cell_bias, output_gate_bias,
+                                                       output_state_in, cell_state_in,
+                                                       scratch_buffer, output_state_out, cell_state_out, output);
+
+    // Check dimensions
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_forget_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_cell_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_forget_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_cell_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_gate_bias->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_in->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(scratch_buffer->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
+                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+
+    const unsigned int num_batches = input->dimension(1);
+    const unsigned int num_cells   = input_to_output_weights->dimension(1);
+
+    // Check peephole optimization
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_output_weights()->num_dimensions() > 1);
+    }
+
+    TensorShape      units_out_transposed_shape = compute_transposed_shape(*recurrent_to_output_weights);
+    TensorShape      num_units_transposed_shape = compute_transposed_shape(*forget_gate_bias);
+    const TensorInfo units_out_transposed_info  = TensorInfo(units_out_transposed_shape, 1, input->data_type());
+    const TensorInfo num_units_transposed_info  = TensorInfo(num_units_transposed_shape, 1, input->data_type());
+
+    TensorInfo input_gate      = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo forget_gate     = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo output_gate_tmp = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+    TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
+
+    // Validate forget gate
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate input gate
+    if(!lstm_params.has_cifg_opt())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(),
+                                            lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo()));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+        if(lstm_params.has_peephole_opt())
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+        }
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+    }
+
+    // Validate cell state
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if(cell_threshold != 0.f)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
+                                                                                                                    cell_threshold)));
+    }
+
+    // Validate output gate tmp
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+    if(lstm_params.has_peephole_opt())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+    }
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+
+    // Validate output state
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    if(lstm_params.has_projection())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
+        if(projection_threshold != 0.f)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(output_state_out, output_state_out,
+                                                                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+        }
+    }
+
+    // Validate copy kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(&cell_state_tmp, cell_state_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
+
+    // Validate scratch concatenation
+    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    if(!lstm_params.has_cifg_opt())
+    {
+        inputs_vector_info_raw.push_back(&input_gate);
+    }
+    inputs_vector_info_raw.push_back(&cell_state_tmp);
+    inputs_vector_info_raw.push_back(&forget_gate);
+    inputs_vector_info_raw.push_back(&output_gate_tmp);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(NEWidthConcatenateLayer::validate(inputs_vector_info_raw, scratch_buffer));
+    return Status{};
+}
+
+void NELSTMLayer::run()
+{
+    _memory_group.acquire();
+
+    _fully_connected_forget_gate.run();
+    NEScheduler::get().schedule(&_transpose_forget_gate, Window::DimY);
+    _gemm_forget_gate.run();
+    NEScheduler::get().schedule(&_accum_forget_gate1, Window::DimY);
+
+    if(_run_peephole_opt)
+    {
+        NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY);
+        _accum_forget_gate2.run();
+    }
+    NEScheduler::get().schedule(&_activation_forget_gate, Window::DimY);
+
+    if(_run_cifg_opt)
+    {
+        if(_ones.info()->data_type() == DataType::F16)
+        {
+            std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+        }
+        else
+        {
+            std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+        }
+        NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY);
+    }
+    else
+    {
+        _fully_connected_input_gate.run();
+        NEScheduler::get().schedule(&_transpose_input_gate, Window::DimY);
+        _gemm_input_gate.run();
+        NEScheduler::get().schedule(&_accum_input_gate1, Window::DimY);
+        if(_run_peephole_opt)
+        {
+            NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
+            _accum_input_gate2.run();
+        }
+        NEScheduler::get().schedule(&_activation_input_gate, Window::DimY);
+    }
+
+    _fully_connected_cell_state.run();
+    NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY);
+    _gemm_cell_state1.run();
+    NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY);
+    NEScheduler::get().schedule(&_activation_cell_state, Window::DimY);
+    NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
+    NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
+    NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY);
+
+    if(_perform_cell_clipping)
+    {
+        NEScheduler::get().schedule(&_cell_clip, Window::DimY);
+    }
+
+    _fully_connected_output.run();
+    NEScheduler::get().schedule(&_transpose_output, Window::DimY);
+    _gemm_output.run();
+    NEScheduler::get().schedule(&_accum_output1, Window::DimY);
+
+    if(_run_peephole_opt)
+    {
+        NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
+        _accum_output2.run();
+    }
+    NEScheduler::get().schedule(&_activation_output, Window::DimY);
+
+    NEScheduler::get().schedule(&_activation_output_state, Window::DimY);
+    NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY);
+
+    if(_has_projection_weights)
+    {
+        _fully_connected_output_state.run();
+        if(_perform_projection_clipping)
+        {
+            NEScheduler::get().schedule(&_projection_clip, Window::DimY);
+        }
+    }
+
+    NEScheduler::get().schedule(&_copy_cell_state, Window::DimY);
+    NEScheduler::get().schedule(&_copy_output, Window::DimY);
+
+    _concat_scratch_buffer.run();
+
+    _memory_group.release();
+}
\ No newline at end of file
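
NELSTMLayer uses LSTMParams to describe the optional paths (peephole, projection, CIFG) before configure()/validate(). A hedged sketch of how those setters fit together, with placeholder tensor names:

    LSTMParams<ITensor> params;
    // Optional peephole connections
    params.set_peephole_params(&cell_to_forget_w, &cell_to_output_w);
    // Optional projection layer (bias may be nullptr)
    params.set_projection_params(&projection_w, &projection_bias);
    // Without CIFG, the input gate gets its own weights and bias
    params.set_cifg_params(&input_to_input_w, &recurrent_to_input_w,
                           &cell_to_input_w, &input_gate_bias);
    // params is then passed to NELSTMLayer::configure() together with the
    // mandatory gate weights, biases and state tensors.
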
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
new file mode 100644
index 0000000..6e7d4ab
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEPriorBoxLayerKernel>();
+    k->configure(input1, input2, output, info);
+    _kernel = std::move(k);
+}
+
+Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+{
+    return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
+}
+} // namespace arm_compute
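
A usage sketch for the new function; the PriorBoxLayerInfo constructor arguments below (min sizes, variances, offset) are an assumption about its interface and purely illustrative:

    Tensor feature_map, image, priors;
    // Assumed argument order: min_sizes, variances, offset
    PriorBoxLayerInfo info({ 30.f }, { 0.1f, 0.1f, 0.2f, 0.2f }, 0.5f);

    NEPriorBoxLayer prior_box;
    prior_box.configure(&feature_map, &image, &priors, info);
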
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
new file mode 100644
index 0000000..0b022df
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+using namespace arm_compute;
+
+NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+{
+}
+
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+    ARM_COMPUTE_UNUSED(keep_dims);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+    for(unsigned int i = 0; i < reduction_axis.num_dimensions(); ++i)
+    {
+        if(output->total_size() > 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(reduction_axis[i]) != 1);
+            ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(reduction_axis[i]) > input->num_dimensions() - 1);
+        }
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output, reduction_axis[i], ReductionOperation::MEAN_SUM));
+    }
+
+    return Status{};
+}
+
+void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _reduction_ops     = reduction_axis.num_dimensions();
+    _reduction_kernels = arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
+    _reduced_outs      = arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
+    _keep_dims         = keep_dims;
+
+    // Perform reduction for every axis
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
+        out_shape.set(reduction_axis[i], 1);
+        auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
+
+        if(i == _reduction_ops - 1 && keep_dims)
+        {
+            _reduction_kernels[i].configure(in, output, reduction_axis[i], ReductionOperation::MEAN_SUM);
+        }
+        else
+        {
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(), input->info()->data_type()));
+            _memory_group.manage(_reduced_outs.get() + i);
+            _reduction_kernels[i].configure(in, _reduced_outs.get() + i, reduction_axis[i], ReductionOperation::MEAN_SUM);
+        }
+    }
+
+    // Allocate intermediate tensors
+    for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    {
+        _reduced_outs[i].allocator()->allocate();
+    }
+
+    // Configure reshape layer if we want to drop the dimensions
+    if(!keep_dims)
+    {
+        TensorShape out_shape = input->info()->tensor_shape();
+        for(unsigned int i = 0; i < _reduction_ops; ++i)
+        {
+            out_shape.remove_dimension(reduction_axis[i]);
+        }
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
+        _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
+    }
+}
+
+void NEReduceMean::run()
+{
+    _memory_group.acquire();
+
+    for(unsigned int i = 0; i < _reduction_ops; ++i)
+    {
+        _reduction_kernels[i].run();
+    }
+
+    if(!_keep_dims)
+    {
+        _reshape.run();
+    }
+    _memory_group.release();
+}
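
A minimal sketch of the new reduce-mean function, averaging over the first two dimensions and dropping them; the input shape is illustrative and the output info is auto-initialised by configure():

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));

    NEReduceMean reduce_mean;
    reduce_mean.configure(&src, Coordinates(0, 1), false /* keep_dims */, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    reduce_mean.run();
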
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index cd0b42f..188c2bb 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 /** Define dimension to split the window
@@ -42,6 +42,10 @@
     {
         case 0:
             return Window::DimY;
+        case 1:
+        case 2:
+        case 3:
+            return Window::DimX;
         default:
             ARM_COMPUTE_ERROR("Unsupported reduction axis");
     }
@@ -59,7 +63,7 @@
 } // namespace
 
 NEReductionOperation::NEReductionOperation()
-    : _reduction_kernel(), _fill_border_kernel(), _window_split(0)
+    : _reduction_kernel(), _fill_border_kernel(), _window_split(0), _reduction_axis()
 {
 }
 
@@ -72,20 +76,28 @@
 
 void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 
     // Configure reduction kernel
     _reduction_kernel.configure(input, output, axis, op);
-    _window_split = reduction_window_split_dimension(axis);
+    _window_split   = reduction_window_split_dimension(axis);
+    _reduction_axis = axis;
 
-    // Configure fill border kernel
-    BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
-    BorderMode fill_border_mode = reduction_operation_border_mode(op);
-    _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+    if(axis == 0)
+    {
+        // Configure fill border kernel
+        BorderSize fill_border_size = (axis == 0) ? _reduction_kernel.border_size() : BorderSize();
+        BorderMode fill_border_mode = reduction_operation_border_mode(op);
+        _fill_border_kernel.configure(input, fill_border_size, fill_border_mode, PixelValue(static_cast<float>(0.f)));
+    }
 }
 
 void NEReductionOperation::run()
 {
-    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    if(_reduction_axis == 0)
+    {
+        NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
+    }
     NEScheduler::get().schedule(&_reduction_kernel, _window_split);
 }
+} // namespace arm_compute
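
Reduction is no longer restricted to axis 0: axes 1-3 split the window along X, and the border-fill kernel only runs for axis 0. A short sketch of a MEAN_SUM reduction along axis 2, with tensors initialised elsewhere:

    Tensor src, dst;
    NEReductionOperation reduction;
    reduction.configure(&src, &dst, 2 /* axis */, ReductionOperation::MEAN_SUM);
    reduction.run();
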
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
new file mode 100644
index 0000000..4ad032b
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEReorgLayerKernel>();
+    k->configure(input, output, stride);
+    _kernel = std::move(k);
+}
+
+Status NEReorgLayer::validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride)
+{
+    return NEReorgLayerKernel::validate(input, output, stride);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index fef4e0c..4600f36 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 
 #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "arm_compute/core/Validate.h"
 #include "support/ToolchainSupport.h"
 
 #include <utility>
@@ -36,3 +37,11 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+
+Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output));
+
+    return Status{};
+}
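The added validate() entry point makes it possible to check a reshape up front, mirroring the kernel-level check; a minimal sketch, where the shapes and data type are illustrative assumptions:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"

using namespace arm_compute;

bool reshape_is_supported()
{
    const TensorInfo src(TensorShape(8U, 8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(256U), 1, DataType::F32);
    // The element counts match (8 * 8 * 4 == 256), so this is expected to return OK.
    return NEReshapeLayer::validate(&src, &dst).error_code() == ErrorCode::OK;
}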
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 3a73f1e..9be9e68 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -36,9 +36,10 @@
 {
 }
 
-void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta)
+void NESoftmaxLayer::configure(ITensor *input, ITensor *output, float beta, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(axis);
 
     // Configure Kernels
     _max_kernel.configure(input, &_max);
@@ -58,8 +59,10 @@
     _tmp.allocator()->allocate();
 }
 
-Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
+Status NESoftmaxLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, size_t axis)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis must be 1 for NEON");
+
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Only 2D inputs are supported");
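The softmax entry points gain an axis parameter for API parity with the CL backend, but the NEON implementation still only operates on axis 1; a hedged sketch of the new guard, where the shapes and beta value are assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"

using namespace arm_compute;

Status check_softmax_axis()
{
    const TensorInfo in(TensorShape(128U, 32U), 1, DataType::F32);
    const TensorInfo out(TensorShape(128U, 32U), 1, DataType::F32);
    // axis == 1 passes the new check; any other value returns an error Status.
    return NESoftmaxLayer::validate(&in, &out, /* beta */ 1.f, /* axis */ 1);
}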
diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
new file mode 100644
index 0000000..9be96af
--- /dev/null
+++ b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
+
+namespace arm_compute
+{
+NEUpsampleLayer::NEUpsampleLayer()
+    : _kernel(), _data_layout()
+{
+}
+
+Status NEUpsampleLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info,
+                                 const InterpolationPolicy &policy)
+{
+    return NEUpsampleLayerKernel::validate(input, output, info, policy);
+}
+
+void NEUpsampleLayer::configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy &policy)
+{
+    _data_layout = input->info()->data_layout();
+    _kernel.configure(input, output, info, policy);
+}
+
+void NEUpsampleLayer::run()
+{
+    const auto win = (_data_layout == DataLayout::NCHW) ? Window::DimZ : Window::DimX;
+    NEScheduler::get().schedule(&_kernel, win);
+}
+} // namespace arm_compute
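A minimal configure/run sketch for the new upsample function; the 2x2 nearest-neighbour parameters are assumptions chosen to illustrate the layout-dependent scheduling above:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void upsample_2x(Tensor &src, Tensor &dst)
{
    NEUpsampleLayer upsample{};
    // run() picks the scheduling dimension from the input layout captured at
    // configure time: DimZ for NCHW, DimX for NHWC.
    upsample.configure(&src, &dst, Size2D(2U, 2U), InterpolationPolicy::NEAREST_NEIGHBOR);
    upsample.run();
}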
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 828a593..c8e3b3b 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -39,6 +39,121 @@
 {
 namespace
 {
+inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    if(input_dims.width > 4 && input_dims.height > 4)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+    }
+
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
+inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
+                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));
+
+    if(act_info.enabled())
+    {
+        NEActivationLayer::validate(output, nullptr, act_info);
+    }
+    return Status{};
+}
+
 inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
 {
     const DataLayout data_layout = input->info()->data_layout();
@@ -52,31 +167,19 @@
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
-    const DataLayout   data_layout = input->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
     ARM_COMPUTE_UNUSED(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
-
     if(biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
-
-    return Status{};
+    return INEWinogradLayerTransformWeightsKernel<float>::validate(input, weights);
 }
 
 Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
 {
     Size2D output_tile = Size2D{};
-
     if(kernel_dims == Size2D(3U, 3U))
     {
         output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
@@ -85,7 +188,30 @@
     {
         output_tile = Size2D(2U, 2U);
     }
-
+    else if(kernel_dims == Size2D(1U, 3U))
+    {
+        output_tile = Size2D(1U, 6U);
+    }
+    else if(kernel_dims == Size2D(3U, 1U))
+    {
+        output_tile = Size2D(6U, 1U);
+    }
+    else if(kernel_dims == Size2D(1U, 5U))
+    {
+        output_tile = Size2D(1U, 4U);
+    }
+    else if(kernel_dims == Size2D(5U, 1U))
+    {
+        output_tile = Size2D(4U, 1U);
+    }
+    else if(kernel_dims == Size2D(7U, 1U))
+    {
+        output_tile = Size2D(2U, 1U);
+    }
+    else if(kernel_dims == Size2D(1U, 7U))
+    {
+        output_tile = Size2D(1U, 2U);
+    }
     return output_tile;
 }
 
@@ -94,7 +220,7 @@
     // Check if we want to configure a Winograd configuration which requires fast math
     using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    std::vector<WinogradConfiguration> fast_math_winograd =
+    const std::vector<WinogradConfiguration> fast_math_winograd =
     {
         WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
@@ -109,7 +235,7 @@
 } //namespace
 
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _asm_glue(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
+    : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
       _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(),
       _is_prepared(false), _is_activationlayer_enabled(false)
 {
@@ -149,48 +275,96 @@
     int n_gemms = 0;
     int N_BLOCK = 0; // Size of block used by GEMM.
 
-    switch(kernel_size.width)
+    if(kernel_size == Size2D(3, 3))
     {
-        case 3:
+        if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
         {
-            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
-            {
-                using config             = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
-                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
-                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
-                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
-                n_gemms                  = config::WinogradBase::N_GEMMS;
-                N_BLOCK                  = config::WinogradConv::N_BLOCK;
-            }
-            else
-            {
-                using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
-                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
-                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
-                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
-                n_gemms                  = config::WinogradBase::N_GEMMS;
-                N_BLOCK                  = config::WinogradConv::N_BLOCK;
-            }
-            break;
-        }
-        case 5:
-        {
-            using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+            using config             = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
             transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
             transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
             transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
             n_gemms                  = config::WinogradBase::N_GEMMS;
             N_BLOCK                  = config::WinogradConv::N_BLOCK;
-            break;
         }
-        default:
+        else
         {
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
+            using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
+            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+            n_gemms                  = config::WinogradBase::N_GEMMS;
+            N_BLOCK                  = config::WinogradConv::N_BLOCK;
         }
     }
+    else if(kernel_size == Size2D(5, 5))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(1, 3))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(3, 1))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(1, 5))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(5, 1))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(1, 7))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else if(kernel_size == Size2D(7, 1))
+    {
+        using config             = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
+        transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
+        transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
+        transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
+        n_gemms                  = config::WinogradBase::N_GEMMS;
+        N_BLOCK                  = config::WinogradConv::N_BLOCK;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported.");
+    }
 
-    const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
+    const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
     const bool        use_same_padding = use_padding_type == PADDING_SAME;
 
     // Get convolved dimensions
@@ -207,19 +381,19 @@
     const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
                                                                                          in_channels)
                                        * data_type_size
-                                       + storage_alignment - 1;
+                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
 
     // Input storage
     const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
                                                                                      use_same_padding)
                                       * data_type_size
-                                      + storage_alignment - 1;
+                                      + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
 
     // Output storage
     const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels,
                                                                                         use_same_padding)
                                        * data_type_size
-                                       + storage_alignment - 1;
+                                       + storage_alignment - 1; /* FIXME: remove alignment after COMPMID-1088 */
     ;
     const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
     const int         kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(kernel_shape);
@@ -241,6 +415,7 @@
     TensorShape a_shape(k, m, 1, n_gemms);
     Strides     a_strides(data_type_size);
     a_strides.set(1, a_strides[0] * k);
+    //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
     a_strides.set(2, 0);
     a_strides.set(3, data_type_size * input_matrix_stride);
 
@@ -252,6 +427,7 @@
     TensorShape d_shape(n, m, 1, n_gemms);
     Strides     d_strides(data_type_size);
     d_strides.set(1, data_type_size * output_matrix_row_stride);
+    //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
     d_strides.set(2, 0);
     d_strides.set(3, data_type_size * output_matrix_stride);
 
@@ -272,6 +448,8 @@
 
     // Configure the InputTransform
     _memory_group.manage(&_input_workspace);
+    _memory_group.manage(&_output_workspace);
+
     if(data_layout == DataLayout::NCHW)
     {
         // configure the kernel to transform the input tensor from NCHW -> NHWC
@@ -279,48 +457,34 @@
         _input_nhwc.allocator()->allocate();
         transform_input_kernel->configure(&_input_nhwc, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                           &_input_workspace, input_matrix_stride);
-    }
-    else
-    {
-        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
-                                          &_input_workspace, input_matrix_stride);
-    }
 
-    // Configure WeightsTransform
-    if(data_layout == DataLayout::NCHW)
-    {
         // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
         _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
 
         transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-    }
-    else
-    {
-        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
-        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
 
-        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-    }
-    _weights_hwio.allocator()->allocate();
-
-    // Configure OutputTransform
-    //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-
-    _memory_group.manage(&_output_workspace);
-    if(data_layout == DataLayout::NCHW)
-    {
+        //The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
         transform_output_kernel->configure(biases, &_output_workspace,
                                            output_matrix_stride, &_output_nhwc,
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
     }
     else
     {
+        transform_input_kernel->configure(_input, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
+                                          &_input_workspace, input_matrix_stride);
+
+        // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+        _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 0U, 1U, 2U));
+
+        transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
+
         transform_output_kernel->configure(biases, &_output_workspace,
                                            output_matrix_stride, _output,
                                            in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
     }
 
-    _asm_glue.configure(&_input_workspace, &_kernel_storage, &_output_workspace, 1.0f, 0.f, false);
+    _weights_hwio.allocator()->allocate();
+    _gemm_function.configure(&_input_workspace, &_kernel_storage, nullptr, &_output_workspace, 1.0f, 0.f);
     _input_workspace.allocator()->allocate();
     _kernel_storage.allocator()->allocate();
     _output_workspace.allocator()->allocate();
@@ -355,12 +519,12 @@
         //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
         _permute_input.run();
     }
+
     // Transform input tensor to the winograd domain
     NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
 
     //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
-    _asm_glue.run();
-
+    _gemm_function.run();
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
@@ -408,97 +572,81 @@
     // Validate input transform
     const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
     const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
-
-    switch(weights->dimension(idx_width))
-    {
-        case 3:
-        {
-            if(input_dims.width > 4 && input_dims.height > 4)
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
-            }
-            else
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
-            }
-            break;
-        }
-        case 5:
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, &input0, winograd_info)));
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
-            break;
-        }
-    }
     // Validate filter transform
     const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
     const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
-
-    switch(weights->dimension(idx_width))
-    {
-        case 3:
-        {
-            if(input_dims.width > 4 && input_dims.height > 4)
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
-            }
-            else
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
-            }
-            break;
-        }
-        case 5:
-        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, &input1, winograd_info)));
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
-            break;
-        }
-    }
     // Validate batched matrix multiply
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    switch(weights->dimension(idx_width))
+
+    if(kernel_size == Size2D(3, 3))
     {
-        case 3:
-        {
-            if(input_dims.width > 4 && input_dims.height > 4)
-            {
-                // Validate output transform
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
-            }
-            else
-            {
-                // Validate output transform
-                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
-            }
-            break;
-        }
-        case 5:
-        {
-            // Validate output transform
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_RETURN_ERROR_MSG("Only 3x3 and 5x5 kernels supported.");
-            break;
-        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+        return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
     }
-    // Validate Activation Layer
-    if(act_info.enabled())
+    else if(kernel_size == Size2D(5, 5))
     {
-        NEActivationLayer::validate(output, nullptr, act_info);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
+        return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    if(kernel_size == Size2D(3, 1))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else if(kernel_size == Size2D(1, 3))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else if(kernel_size == Size2D(5, 1))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else if(kernel_size == Size2D(1, 5))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else if(kernel_size == Size2D(7, 1))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else if(kernel_size == Size2D(1, 7))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
+        return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
     }
     return Status{};
 }
@@ -513,8 +661,8 @@
 
         // Transform weights
         NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
-        _weights_hwio.allocator()->free();
 
+        _weights_hwio.allocator()->free();
         _is_prepared = true;
     }
 }
diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/NEON/functions/NEYOLOLayer.cpp
new file mode 100644
index 0000000..e52d054
--- /dev/null
+++ b/src/runtime/NEON/functions/NEYOLOLayer.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
+
+#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+void NEYOLOLayer::configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    auto k = arm_compute::support::cpp14::make_unique<NEYOLOLayerKernel>();
+    k->configure(input, output, act_info, num_classes);
+    _kernel = std::move(k);
+}
+
+Status NEYOLOLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes)
+{
+    return NEYOLOLayerKernel::validate(input, output, act_info, num_classes);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
index b52ce66..c87e82a 100644
--- a/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
+++ b/src/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/assembly/NEGEMMInterleavedWrapper.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedMatrixMultiplyWrapper.h"
 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedPrepareBWrapperKernel.h"
 #include "arm_compute/core/NEON/kernels/assembly/NEGEMMInterleavedTransformAWrapper.h"
@@ -42,7 +43,7 @@
     prepare();
 
     _memory_group.acquire();
-    NEScheduler::get().run_workloads(_workloads);
+    NEScheduler::get().run_tagged_workloads(_workloads, _tag.c_str());
     _memory_group.release();
 }
 
@@ -151,51 +152,59 @@
     const unsigned int alignment = 128;
     _transformed_b.allocator()->init(TensorInfo{}, alignment);
     _tmp_c.allocator()->init(TensorInfo{}, alignment);
+    _tag = "NEGEMMInterleaved_";
+    _tag += get_strategy_name(input_type, use_dot);
+
     if(!_pretranspose_b)
     {
         // If B is transposed at every iteration then transformed_B can be managed:
         _memory_group.manage(&_transformed_b);
+        _block_sizes = calculate_block_sizes_from_data_type(NEScheduler::get().cpu_info(), _params.M, _params.N, _params.K, input_type, use_dot);
     }
-    switch(input_type)
+    else
     {
-        case DataType::F32:
-            _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
-            break;
+        _tag += "_preB";
+        switch(input_type)
+        {
+            case DataType::F32:
+                _prepare_b = instantiate_prepareB<float>(_b, &_transformed_b, _params);
+                break;
 #ifdef __aarch64__
-        case DataType::U8:
-        case DataType::QASYMM8:
-            if(use_dot)
-            {
-                _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
-            }
-            else
-            {
-                _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
-            }
-            break;
-        case DataType::S8:
-            if(use_dot)
-            {
-                _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
-            }
-            else
-            {
-                _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
-            }
-            break;
+            case DataType::U8:
+            case DataType::QASYMM8:
+                if(use_dot)
+                {
+                    _prepare_b = instantiate_prepareB<uint8_t, true>(_b, &_transformed_b, _params);
+                }
+                else
+                {
+                    _prepare_b = instantiate_prepareB<uint8_t, false>(_b, &_transformed_b, _params);
+                }
+                break;
+            case DataType::S8:
+                if(use_dot)
+                {
+                    _prepare_b = instantiate_prepareB<int8_t, true>(_b, &_transformed_b, _params);
+                }
+                else
+                {
+                    _prepare_b = instantiate_prepareB<int8_t, false>(_b, &_transformed_b, _params);
+                }
+                break;
 #endif /* __aarch64__ */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
-            break;
+            case DataType::F16:
+                _prepare_b = instantiate_prepareB<__fp16>(_b, &_transformed_b, _params);
+                break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("DataType not supported");
-            break;
-    }
-    ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
+            default:
+                ARM_COMPUTE_ERROR("DataType not supported");
+                break;
+        }
+        ARM_COMPUTE_ERROR_ON(_prepare_b == nullptr);
 
-    _block_sizes = _prepare_b->block_sizes();
+        _block_sizes = _prepare_b->block_sizes();
+    }
 
     _block_walker.set(Window::DimX, Window::Dimension(0, ceil_to_multiple(_params.N, _block_sizes.x_block), _block_sizes.x_block));
     _block_walker.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(_params.K, _block_sizes.k_block), _block_sizes.k_block));
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index f4253c8..2355389 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -42,7 +42,6 @@
 OMPScheduler::OMPScheduler() // NOLINT
     : _num_threads(omp_get_max_threads())
 {
-    get_cpu_configuration(_cpu_info);
 }
 
 unsigned int OMPScheduler::num_threads() const
@@ -90,6 +89,7 @@
     }
 }
 
+#ifndef DOXYGEN_SKIP_THIS
 void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
 {
     const unsigned int num_threads = std::min(_num_threads, static_cast<unsigned int>(workloads.size()));
@@ -108,3 +108,4 @@
         workloads[tid](info);
     }
 }
+#endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index 96f54f8..36eaf0b 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/IAllocator.h"
 #include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/runtime/Types.h"
 #include "support/ToolchainSupport.h"
 
@@ -37,14 +38,7 @@
     : _allocator(allocator), _blob(), _blob_size(blob_size)
 {
     ARM_COMPUTE_ERROR_ON(!allocator);
-    _blob = _allocator->allocate(_blob_size, 0);
-}
-
-OffsetMemoryPool::~OffsetMemoryPool()
-{
-    ARM_COMPUTE_ERROR_ON(!_allocator);
-    _allocator->free(_blob);
-    _blob = nullptr;
+    _blob = _allocator->make_region(blob_size, 0);
 }
 
 void OffsetMemoryPool::acquire(MemoryMappings &handles)
@@ -55,7 +49,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        *handle.first = reinterpret_cast<uint8_t *>(_blob) + handle.second;
+        handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_size - handle.second));
     }
 }
 
@@ -64,7 +58,7 @@
     for(auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
-        *handle.first = nullptr;
+        handle.first->set_region(nullptr);
     }
 }
 
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 293241d..5ec2ce9 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -73,6 +73,36 @@
     _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
 }
 
+std::unique_ptr<IMemoryPool> PoolManager::release_pool()
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!");
+
+    if(!_free_pools.empty())
+    {
+        std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front());
+        ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr);
+        _free_pools.pop_front();
+
+        // Update semaphore
+        _sem = arm_compute::support::cpp14::make_unique<arm_compute::Semaphore>(_free_pools.size());
+
+        return pool;
+    }
+
+    return nullptr;
+}
+
+void PoolManager::clear_pools()
+{
+    std::lock_guard<arm_compute::Mutex> lock(_mtx);
+    ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to clear the PoolManager!");
+    _free_pools.clear();
+
+    // Update semaphore
+    _sem = nullptr;
+}
+
 size_t PoolManager::num_pools() const
 {
     std::lock_guard<arm_compute::Mutex> lock(_mtx);
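release_pool() and clear_pools() let callers shrink or drop the free-pool list once no function is using it; a hypothetical trimming helper (the loop structure and the helper name are assumptions, not library code):

#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/PoolManager.h"

#include <memory>

using namespace arm_compute;

void trim_free_pools(PoolManager &pool_manager)
{
    // Only legal while every pool is free; each released pool is destroyed here,
    // returning its backing memory to the allocator that created it.
    while(pool_manager.num_pools() != 0)
    {
        std::unique_ptr<IMemoryPool> pool = pool_manager.release_pool();
        if(pool == nullptr)
        {
            break; // no free pool left to hand out
        }
    }
}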
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index c84a271..5fa51d7 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -127,39 +127,35 @@
 
 uint8_t *TensorAllocator::data() const
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    return reinterpret_cast<uint8_t *>(_memory.region()->buffer());
+    return (_memory.region() == nullptr) ? nullptr : reinterpret_cast<uint8_t *>(_memory.region()->buffer());
 }
 
 void TensorAllocator::allocate()
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
-
     if(_associated_memory_group == nullptr)
     {
-        _memory = Memory(std::make_shared<MemoryRegion>(info().total_size(), alignment()));
+        _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(info().total_size(), alignment()));
     }
     else
     {
-        _associated_memory_group->finalize_memory(_owner, reinterpret_cast<void **>(_memory.region()->handle()), info().total_size());
-        _memory.region()->set_size(info().total_size());
+        _associated_memory_group->finalize_memory(_owner, _memory, info().total_size());
     }
     info().set_is_resizable(false);
 }
 
 void TensorAllocator::free()
 {
-    _memory = Memory();
+    _memory.set_region(nullptr);
     info().set_is_resizable(true);
 }
 
-arm_compute::Status TensorAllocator::import_memory(Memory memory)
+arm_compute::Status TensorAllocator::import_memory(void *memory, size_t size)
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(memory.region()->buffer() == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr);
+    ARM_COMPUTE_RETURN_ERROR_ON(size == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(_associated_memory_group != nullptr);
-    _memory = memory;
+
+    _memory.set_owned_region(support::cpp14::make_unique<MemoryRegion>(memory, info().total_size()));
     info().set_is_resizable(false);
 
     return Status{};
@@ -167,10 +163,10 @@
 
 void TensorAllocator::set_associated_memory_group(MemoryGroup *associated_memory_group)
 {
-    ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
     ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
     ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr);
-    ARM_COMPUTE_ERROR_ON(_memory.region()->buffer() != nullptr);
+    ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.region()->buffer() != nullptr);
+
     _associated_memory_group = associated_memory_group;
 }