arm_compute v18.01
Change-Id: I9bfa178c2e38bfd5fc812e62aab6760d87748e05
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index de75518..22a328b 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -186,6 +186,7 @@
{ "copy_plane", "channel_extract.cl" },
{ "copy_planes_3p", "channel_combine.cl" },
{ "copy_to_keypoint", "fast_corners.cl" },
+ { "deconvolution_upsample", "deconvolution_layer.cl" },
{ "depthwise_convolution_3x3", "depthwise_convolution.cl" },
{ "depthwise_convolution_3x3_quantized", "depthwise_convolution_quantized.cl" },
{ "depthwise_im2col", "depthwise_convolution.cl" },
@@ -290,6 +291,9 @@
{ "NV21_to_RGB888_bt709", "color_convert.cl" },
{ "NV21_to_RGBA8888_bt709", "color_convert.cl" },
{ "NV21_to_YUV444_bt709", "color_convert.cl" },
+ { "permute_201", "permute.cl" },
+ { "permute_120", "permute.cl" },
+ { "permute_3201", "permute.cl" },
{ "pixelwise_mul_float", "pixelwise_mul_float.cl" },
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pooling_layer_2", "pooling_layer.cl" },
@@ -421,6 +425,10 @@
#include "./cl_kernels/convolution_rectangle.clembed"
},
{
+ "deconvolution_layer.cl",
+#include "./cl_kernels/deconvolution_layer.clembed"
+ },
+ {
"depth_convert.cl",
#include "./cl_kernels/depth_convert.clembed"
},
@@ -569,6 +577,10 @@
#include "./cl_kernels/optical_flow_pyramid_lk.clembed"
},
{
+ "permute.cl",
+#include "./cl_kernels/permute.clembed"
+ },
+ {
"pixelwise_mul_float.cl",
#include "./cl_kernels/pixelwise_mul_float.clembed"
},
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 3eb94b7..7da7438 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,6 +43,12 @@
return;
}
+ // Make sure that dimensions > Z are 1
+ for(unsigned int i = 3; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_ERROR_ON((window[i].end() - window[i].start()) != 1);
+ }
+
cl::NDRange gws = ICLKernel::gws_from_window(window);
// Check for empty NDRange
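
The new assertion reflects that an OpenCL NDRange carries at most three dimensions, so any window dimension beyond Z must already be collapsed to extent 1 before the global work size is derived. A minimal C++ sketch of the same check, using a plain struct rather than the arm_compute Window API (names here are illustrative):

```cpp
#include <array>
#include <cassert>
#include <cstddef>

// Hypothetical upper bound on window dimensions, standing in for
// Coordinates::num_max_dimensions.
constexpr std::size_t kMaxDims = 6;

struct Dim { int start; int end; };  // half-open range [start, end)

// Returns the 3D global work size, asserting that every dimension beyond Z
// has extent 1 (OpenCL NDRange supports at most three dimensions).
std::array<std::size_t, 3> gws_from_window(const std::array<Dim, kMaxDims> &win)
{
    for (std::size_t i = 3; i < kMaxDims; ++i)
    {
        assert(win[i].end - win[i].start == 1 && "dimensions > Z must be 1");
    }
    return { static_cast<std::size_t>(win[0].end - win[0].start),
             static_cast<std::size_t>(win[1].end - win[1].start),
             static_cast<std::size_t>(win[2].end - win[2].start) };
}
```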
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 157b6d6..726279c 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -74,8 +74,9 @@
#define LOAD_FUNCTION_PTR(func_name, handle) \
func_name##_ptr = reinterpret_cast<decltype(func_name) *>(dlsym(handle, #func_name));
- LOAD_FUNCTION_PTR(clBuildProgram, handle);
- LOAD_FUNCTION_PTR(clEnqueueNDRangeKernel, handle);
+ LOAD_FUNCTION_PTR(clCreateContextFromType, handle);
+ LOAD_FUNCTION_PTR(clCreateCommandQueue, handle);
+ LOAD_FUNCTION_PTR(clGetContextInfo, handle);
LOAD_FUNCTION_PTR(clBuildProgram, handle);
LOAD_FUNCTION_PTR(clEnqueueNDRangeKernel, handle);
LOAD_FUNCTION_PTR(clSetKernelArg, handle);
@@ -125,6 +126,59 @@
}
} // namespace arm_compute
+cl_int clGetContextInfo(cl_context context,
+ cl_context_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(context, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_command_queue clCreateCommandQueue(cl_context context,
+ cl_device_id device,
+ cl_command_queue_properties properties,
+ cl_int *errcode_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr;
+ if(func != nullptr)
+ {
+ return func(context, device, properties, errcode_ret);
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
+cl_context clCreateContextFromType(const cl_context_properties *properties,
+ cl_device_type device_type,
+ void (*pfn_notify)(const char *, const void *, size_t, void *),
+ void *user_data,
+ cl_int *errcode_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr;
+ if(func != nullptr)
+ {
+ return func(properties, device_type, pfn_notify, user_data, errcode_ret);
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
cl_int clBuildProgram(
cl_program program,
cl_uint num_devices,
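
The three newly exported entry points (clCreateContextFromType, clCreateCommandQueue, clGetContextInfo) follow the same pattern as the existing stubs: lazily resolve the symbol from the OpenCL driver, forward the call if it was found, and otherwise return a benign failure value. A stripped-down sketch of that dispatch pattern, with a hypothetical one-off symbol table in place of the CLSymbols class:

```cpp
#include <dlfcn.h>
#include <CL/cl.h>

// Hypothetical lazily-initialised function pointer, resolved from the
// OpenCL driver the first time it is needed.
static decltype(&clGetContextInfo) s_clGetContextInfo_ptr = nullptr;

static void load_symbols_once()
{
    static void *handle = dlopen("libOpenCL.so", RTLD_LAZY | RTLD_LOCAL);
    if (handle != nullptr && s_clGetContextInfo_ptr == nullptr)
    {
        s_clGetContextInfo_ptr =
            reinterpret_cast<decltype(&clGetContextInfo)>(dlsym(handle, "clGetContextInfo"));
    }
}

// Stub with the real OpenCL signature: forward when the driver provides the
// symbol, otherwise fail gracefully instead of crashing.
extern "C" cl_int clGetContextInfo_stub(cl_context context,
                                        cl_context_info param_name,
                                        size_t param_value_size,
                                        void *param_value,
                                        size_t *param_value_size_ret)
{
    load_symbols_once();
    if (s_clGetContextInfo_ptr != nullptr)
    {
        return s_clGetContextInfo_ptr(context, param_name, param_value_size,
                                      param_value, param_value_size_ret);
    }
    return CL_OUT_OF_RESOURCES;
}
```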
diff --git a/src/core/CL/cl_kernels/activation_layer_qa8.cl b/src/core/CL/cl_kernels/activation_layer_qa8.cl
index 910a93f..02668f7 100644
--- a/src/core/CL/cl_kernels/activation_layer_qa8.cl
+++ b/src/core/CL/cl_kernels/activation_layer_qa8.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,12 +89,16 @@
// Perform activation
data = ACTIVATION_OP(ACT, data);
+#if defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
// requantize to output space
- float16 fdata = convert_float16(data);
- fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
- uchar16 qdata = convert_uchar16_sat(fdata);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));
+
+ fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL);
+ data = CONVERT_SAT(fdata, VEC_DATA_TYPE(uchar, VEC_SIZE));
+#endif // defined(O1_VAL) && defined(O2_VAL) && defined(S1_VAL) && defined(S2_VAL)
// Store result
VSTORE(VEC_SIZE)
- (qdata, 0, (__global DATA_TYPE *)output.ptr);
+ (data, 0, (__global DATA_TYPE *)output.ptr);
}
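
With the new guard, the requantisation only runs when the kernel is built with both quantisation infos (O1_VAL/S1_VAL for the input, O2_VAL/S2_VAL for the output); otherwise the activated value is stored as-is. The arithmetic itself is unchanged. A scalar C++ sketch of the same mapping (names are illustrative, not the kernel's macros):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Requantise a QASYMM8 value from the input space (scale s1, offset o1) to
// the output space (scale s2, offset o2), mirroring
// round((q - O1_VAL) * (S1_VAL / S2_VAL) + O2_VAL) in the kernel.
std::uint8_t requantize(std::uint8_t q, float s1, std::int32_t o1, float s2, std::int32_t o2)
{
    const float v = std::round((static_cast<float>(q) - o1) * (s1 / s2) + o2);
    return static_cast<std::uint8_t>(std::min(255.0f, std::max(0.0f, v)));  // saturate like CONVERT_SAT
}
```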
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 0341410..1296347 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -35,40 +35,46 @@
#define SUB(x, y) (x) - (y)
#endif /* SATURATE */
-/** This function add two images.
+/** This function adds two tensors.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16/F16/F32
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/QS8/QS16/S16/F16/F32
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void arithmetic_add(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
@@ -80,40 +86,46 @@
vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
-/** This function subtracts one image from another.
+/** This function subtracts one tensor from another.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void arithmetic_sub(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index f7aa5eb..fbffefb 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,7 +123,7 @@
numerator = SUB_OP(data, numerator);
x_bar = MUL_OP(numerator, denominator);
- gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * beta.stride_x));
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
VSTORE(VEC_SIZE)
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
index 94ad53c..166d681 100644
--- a/src/core/CL/cl_kernels/canny.cl
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -226,7 +226,7 @@
#define EDGE 255
#define hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side
-#define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate
+#define hysteresis_local_stack_L2 16 // The size of level 2 stack; adjusting this can impact the match rate with the VX implementation
/** Check whether pixel is valid
*
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
new file mode 100644
index 0000000..2514ddc
--- /dev/null
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2017, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function upsamples an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: F32
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void deconvolution_upsample(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(dst))
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Store result
+ *((__global float *)dst.ptr) = *((__global float *)src.ptr);
+}
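
The kernel itself is a plain per-element copy; the upsampling effect comes from the strided output window set up by the host side (see CLDeconvolutionLayerUpsampleKernel::run below), which makes each work item write its input value to every stride-th output location while the remaining outputs keep their fill value. A rough host-style C++ sketch of that scatter for a 2D F32 plane, assuming zero fill and no padding or inner border:

```cpp
#include <cstddef>
#include <vector>

// Scatter a (w x h) input into a zero-initialised output whose spatial
// dimensions were expanded by (stride_x, stride_y); input element (x, y)
// lands at output (x * stride_x, y * stride_y).
std::vector<float> upsample_scatter(const std::vector<float> &src, int w, int h,
                                    int stride_x, int stride_y)
{
    const int out_w = w * stride_x;
    const int out_h = h * stride_y;
    std::vector<float> dst(static_cast<std::size_t>(out_w) * out_h, 0.0f);
    for (int y = 0; y < h; ++y)
    {
        for (int x = 0; x < w; ++x)
        {
            dst[static_cast<std::size_t>(y * stride_y) * out_w + x * stride_x] =
                src[static_cast<std::size_t>(y) * w + x];
        }
    }
    return dst;
}
```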
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0cd4e71..8a757fc 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,22 +45,23 @@
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_1(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_1(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp = CONVERT(vload4(0, left_pixel), int4);
+ int8 temp0 = CONVERT(vload8(0, left_pixel), int8);
+ int2 temp1 = CONVERT(vload2(0, (left_pixel + 8 * sizeof(uchar))), int2);
- int2 left = CONVERT(temp.s01, int2);
- int2 middle = CONVERT(temp.s12, int2);
- int2 right = CONVERT(temp.s23, int2);
+ int8 left = CONVERT(temp0.s01234567, int8);
+ int8 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8);
+ int8 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 2 for uchar type.
@@ -72,23 +73,23 @@
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_2(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_2(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int temp1 = CONVERT(*(left_pixel + 4 * sizeof(uchar)), int);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int temp1 = CONVERT(*(left_pixel + 16 * sizeof(uchar)), int);
- int2 left = CONVERT(temp0.s02, int2);
- int2 middle = CONVERT(temp0.s13, int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1), int2);
+ int8 left = CONVERT(temp0.s02468ace, int8);
+ int8 middle = CONVERT(temp0.s13579bdf, int8);
+ int8 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Compute a 1D horizontal convolution of size 3 and stride 3 for uchar type.
@@ -100,23 +101,23 @@
* @param[in] input_offset Quantized offset of zero point of the input tensor data range
* @param[in] weight_offset Quantized offset of zero point of the weights tensor data range
*
- * @return a int2 containing 2 convoluted values.
+ * @return an int8 containing 8 convolved values.
*/
-inline int2 convolution1x3_stride_3(__global const uchar *left_pixel,
+inline int8 convolution1x3_stride_3(__global const uchar *left_pixel,
const int left_coeff,
const int middle_coeff,
const int right_coeff,
const int input_offset,
const int weight_offset)
{
- int4 temp0 = CONVERT(vload4(0, left_pixel), int4);
- int2 temp1 = CONVERT(vload2(0, (left_pixel + 4 * sizeof(uchar))), int2);
+ int16 temp0 = CONVERT(vload16(0, left_pixel), int16);
+ int8 temp1 = CONVERT(vload8(0, (left_pixel + 16 * sizeof(uchar))), int8);
- int2 left = CONVERT(temp0.s03, int2);
- int2 middle = CONVERT((int2)(temp0.s1, temp1.s0), int2);
- int2 right = CONVERT((int2)(temp0.s2, temp1.s1), int2);
+ int8 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8);
+ int8 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8);
+ int8 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8);
- return (left + input_offset) * (int2)(left_coeff + weight_offset) + (middle + input_offset) * (int2)(middle_coeff + weight_offset) + (right + input_offset) * (int2)(right_coeff + weight_offset);
+ return (left + input_offset) * (int8)(left_coeff + weight_offset) + (middle + input_offset) * (int8)(middle_coeff + weight_offset) + (right + input_offset) * (int8)(right_coeff + weight_offset);
}
/** Apply a 3x3 convolution matrix to a single channel QASYMM8 input image and return the result.
@@ -144,9 +145,9 @@
* @param[in] output_shift Output scale divisor exponent
* @param[in] bias (Optional) Bias value
*
- * @return a uchar2 containing 2 convoluted values.
+ * @return a uchar8 containing 8 convolved values.
*/
-inline uchar2 convolution3x3(
+inline uchar8 convolution3x3(
Image *src,
const uchar mat0, const uchar mat1, const uchar mat2,
const uchar mat3, const uchar mat4, const uchar mat5,
@@ -159,20 +160,20 @@
#endif //defined(HAS_BIAS)
)
{
- int2 pixels;
+ int8 pixels;
pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5, input_offset, weight_offset);
pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8, input_offset, weight_offset);
#if defined(HAS_BIAS)
- pixels += (int2)(bias);
+ pixels += (int8)(bias);
#endif //defined(HAS_BIAS)
- pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 2);
+ pixels = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels, output_multiplier, output_shift, 8);
pixels = pixels + output_offset;
pixels = clamp(pixels, 0, 255);
- return CONVERT(pixels, uchar2);
+ return CONVERT(pixels, uchar8);
}
/** This function computes the horizontal integral of the image.
@@ -241,7 +242,7 @@
int bias_value = *((__global int *)(vector_offset(&biases, get_global_id(2))));
#endif //defined(HAS_BIAS)
- uchar2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
+ uchar8 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
weights_values1.s0, weights_values1.s1, weights_values1.s2,
weights_values2.s0, weights_values2.s1, weights_values2.s2,
input_offset, weight_offset, output_offset,
@@ -252,7 +253,7 @@
#endif //defined(HAS_BIAS)
);
- vstore2(pixels, 0, dst.ptr);
+ vstore8(pixels, 0, dst.ptr);
}
#endif //defined(CONV_STRIDE_X)
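
The widened variants keep the same per-element arithmetic as before, just with eight accumulators instead of two: offset-corrected multiply-accumulate over the 3x3 taps, fixed-point requantisation, output-offset add and clamp to [0, 255]. A scalar C++ sketch of one output pixel, with the fixed-point ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE step approximated by a float multiplier (an assumption; the kernel's exact rounding differs):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// One QASYMM8 3x3 depthwise output value. `in` and `w` are the 3x3
// neighbourhood and weights; the real kernel uses a fixed-point
// multiplier + shift where this sketch uses `out_scale`.
std::uint8_t depthwise3x3_qasymm8(const std::uint8_t in[3][3], const std::uint8_t w[3][3],
                                  int input_offset, int weight_offset,
                                  float out_scale, int output_offset, int bias)
{
    std::int32_t acc = bias;
    for (int r = 0; r < 3; ++r)
    {
        for (int c = 0; c < 3; ++c)
        {
            acc += (static_cast<std::int32_t>(in[r][c]) + input_offset) *
                   (static_cast<std::int32_t>(w[r][c]) + weight_offset);
        }
    }
    const std::int32_t q = static_cast<std::int32_t>(std::lround(acc * out_scale)) + output_offset;
    return static_cast<std::uint8_t>(std::min(255, std::max(0, q)));
}
```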
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 768f7ee..02c6c4c 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,7 +123,7 @@
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z, name##_stride_w, name##_step_z, mod_size)
+ name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 5d3a607..3d37fbc 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -124,7 +124,7 @@
for(; xc < CELL_WIDTH; xc++)
{
const float mag_value = *((__global short *)mag_row_ptr + xc);
- const float phase_value = *(mag_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
+ const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
const float w1 = phase_value - floor(phase_value);
// The quantised phase is the histogram index [0, NUM_BINS - 1]
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
index 507e85c..e1131d5 100644
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -29,6 +29,9 @@
* - Determinant less than DETERMINANT_THR
* - or minimum eigenvalue is smaller then EIGENVALUE_THR
*
+ * The thresholds for the determinant and the minimum eigenvalue are
+ * defined by the OpenVX spec.
+ *
* Note: Also lost tracking happens when the point tracked coordinate is outside
* the image coordinates
*
@@ -265,7 +268,7 @@
float4 w;
w = round(w_scharr * (float4)D0);
- w.s3 = D0 - w.s0 - w.s1 - w.s2;
+ w.s3 = D0 - w.s0 - w.s1 - w.s2; // Added for matching VX implementation
// G.s0 = A11, G.s1 = A12, G.s2 = A22, G.s3 = min_eig
int4 iG = (int4)0;
@@ -306,7 +309,7 @@
// Compute bilinear interpolation for iyval
old_i.s2 = dot(px, w_scharr);
- // Rounding (it could be omitted)
+ // Rounding (could be omitted; used just to match the VX implementation)
int4 iold = convert_int4(round(old_i));
// Accumulate values in the Spatial Gradient Matrix
@@ -349,8 +352,8 @@
* @param[in] border_limits It stores the right border limit (width - window_dimension - 1, height - window_dimension - 1,)
* @param[in] eig_const 1.0f / (float)(2.0f * window_dimension * window_dimension)
* @param[in] level0 It is set to 1 if level of pyramid = 0
- * @param[in] term_iteration It is set to 1 if termination = TERM_CRITERIA_ITERATIONS
- * @param[in] term_epsilon It is set to 1 if termination = TERM_CRITERIA_EPSILON
+ * @param[in] term_iteration It is set to 1 if termination = VX_TERM_CRITERIA_ITERATIONS
+ * @param[in] term_epsilon It is set to 1 if termination = VX_TERM_CRITERIA_EPSILON
*/
void __kernel lktracker_stage1(
IMAGE_DECLARATION(new_image),
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
new file mode 100644
index 0000000..6f978c9
--- /dev/null
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN)
+/** Perform a DCHW -> DHWC permute operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_201(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
+}
+
+/** Perform a DCHW -> DWCH permute operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_120(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
+}
+
+/** Perform a DCHW -> HWCD permute operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute_3201(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN)
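
Each permute kernel reads one element at the work item's input coordinate (x, y, c, n), where c = get_global_id(2) % DEPTH_IN and n = get_global_id(2) / DEPTH_IN, and writes it at the permuted output coordinate. For permute_201 that destination is (c, x, y, n); a host-style C++ sketch of the destination offset, assuming dense strides rather than the kernel's Tensor4D struct:

```cpp
#include <array>
#include <cstddef>

// For permute_201: the element read at input coordinate (x, y, c, n) is
// written at output coordinate (c, x, y, n). This computes the linear
// offset of that destination for an output of shape {dim0, dim1, dim2, dim3}.
std::size_t permute_201_dst_index(std::size_t x, std::size_t y, std::size_t c, std::size_t n,
                                  const std::array<std::size_t, 4> &out_shape)
{
    return c
         + x * out_shape[0]
         + y * out_shape[0] * out_shape[1]
         + n * out_shape[0] * out_shape[1] * out_shape[2];
}
```

permute_120 and permute_3201 differ only in the order of the destination coordinates, as visible in their tensor4D_offset calls above.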
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
index d249aa6..6ffb7e4 100644
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -48,7 +48,7 @@
const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
// transform [z,z+1,z+2,z+3]
const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8)));
- // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with reference implementation
+ // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation
// transform [x,x+1,x+2,x+3]
const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z;
// transform [y,y+1,y+2,y+3]
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index eecc94f..d85de88 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -145,18 +145,21 @@
build_opts.emplace(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
- // Set scale and offset of the input and output
- if(is_data_type_quantized_asymmetric(dt))
+ // Set scale and offset of the input and output if they have different quantization info
+ if(is_data_type_quantized_asymmetric(dt) && output != nullptr)
{
- float s1 = input->info()->quantization_info().scale;
- int o1 = input->info()->quantization_info().offset;
- // If output is nullptr, assume same quantization scale/offset as input
- float s2 = output != nullptr ? output->info()->quantization_info().scale : s1;
- int o2 = output != nullptr ? output->info()->quantization_info().offset : o1;
- build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
- build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
- build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
- build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+ const float s1 = input->info()->quantization_info().scale;
+ const float s2 = output->info()->quantization_info().scale;
+ const int o1 = input->info()->quantization_info().offset;
+ const int o2 = output->info()->quantization_info().offset;
+
+ if(o1 != o2 || s1 != s2)
+ {
+ build_opts.emplace(("-DS1_VAL=" + float_to_string_with_full_precision(s1)));
+ build_opts.emplace(("-DS2_VAL=" + float_to_string_with_full_precision(s2)));
+ build_opts.emplace(("-DO1_VAL=" + support::cpp11::to_string(o1)));
+ build_opts.emplace(("-DO2_VAL=" + support::cpp11::to_string(o2)));
+ }
}
}
else
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 2789573..75701ee 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -154,14 +154,16 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
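
Switching the addition and subtraction kernels from 2D to 3D slices (together with collapse_if_possible) lets one enqueue cover a whole 3D block instead of looping over every Z plane on the host. Conceptually the work covered per slice is the following element-wise loop, shown here as plain C++ rather than the Window/ICLKernel machinery:

```cpp
#include <cstddef>
#include <vector>

// Element-wise add over one 3D slice of shape (w, h, d); with the window
// collapsed, a single enqueue covers the whole (w, h, d) volume instead of
// d separate 2D launches.
void add3d(const std::vector<float> &in1, const std::vector<float> &in2,
           std::vector<float> &out, std::size_t w, std::size_t h, std::size_t d)
{
    for (std::size_t z = 0; z < d; ++z)
        for (std::size_t y = 0; y < h; ++y)
            for (std::size_t x = 0; x < w; ++x)
            {
                const std::size_t i = x + y * w + z * w * h;
                out[i] = in1[i] + in2[i];
            }
}
```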
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index cc2ef1f..8308aa0 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -146,15 +146,16 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
new file mode 100644
index 0000000..5c08d5b
--- /dev/null
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
+ : _input(nullptr), _output(nullptr), _inner_border(), _info()
+{
+}
+
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) == 0);
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1, "inner_border_right must be smaller than stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1, "inner_border_top must be smaller than stride_y");
+
+ return Status{};
+}
+
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _input = input;
+ _output = output;
+ _inner_border = inner_border;
+ _info = info;
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), inner_border, info));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("deconvolution_upsample"));
+
+ constexpr unsigned int num_elems_processed_per_iteration = 1;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal output_access(output->info(), 0, 0, num_elems_processed_per_iteration);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ ICLKernel::configure(win);
+}
+
+void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ const int out_start_x = _info.pad().first;
+ const int out_end_x = _output->info()->dimension(0) - _inner_border.right - _info.pad().first + _info.stride().first - 1;
+ const int out_step_x = _info.stride().first;
+
+ const int out_start_y = _inner_border.top + _info.pad().second;
+ const int out_end_y = _output->info()->dimension(1) - _info.pad().second + _info.stride().second - 1;
+ const int out_step_y = _info.stride().second;
+
+ Window slice_out = window.first_slice_window_2D();
+ slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
+ slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
+
+ Window slice_in = window.first_slice_window_2D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 0275d4f..9b30c64 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -103,7 +103,7 @@
AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index ddc3a2d..1c0fe99 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,32 +33,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
using namespace arm_compute;
-
-namespace
-{
-/** Calculates expected output shape dimension
- *
- * @param[in] Input shape
- *
- * @return Expected output shape
- */
-TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
-{
- unsigned int output_width = 0;
- unsigned int output_height = 0;
-
- std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
-
- TensorShape output_shape = input_shape;
- output_shape.set(0, output_width);
- output_shape.set(1, output_height);
-
- return output_shape;
-}
-} // namespace
+using namespace arm_compute::misc::shape_calculator;
CLDepthwiseConvolutionLayer3x3Kernel::CLDepthwiseConvolutionLayer3x3Kernel()
: _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0)
@@ -91,7 +70,7 @@
}
// Get convolved dimensions
- TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
// Output auto inizialitation if not yet initialized
auto_init_if_empty(*output->info(),
@@ -179,9 +158,9 @@
}
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 2;
- const unsigned int num_elems_written_per_iteration = 2;
- const unsigned int num_elems_read_per_iteration = 3 + _conv_stride_x;
+ const unsigned int num_elems_processed_per_iteration = 8 / data_size_from_type(input->info()->data_type());
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration;
+ const unsigned int num_elems_read_per_iteration = 3 + (num_elems_processed_per_iteration - 1) * _conv_stride_x;
const unsigned int num_rows_read_per_iteration = 3;
Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
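
The new window setup derives the vector width from the data size (8 bytes per iteration, so 8 elements for QASYMM8) and reads 3 + (n - 1) * stride_x input elements per row, which matches the wider loads in the .cl changes above: 10 values for stride 1, 17 for stride 2, 24 for stride 3 when n = 8. A tiny C++ sketch of that bookkeeping:

```cpp
// Elements touched per work item for a 1x3 horizontal pass, as configured
// by CLDepthwiseConvolutionLayer3x3Kernel for an n-wide output vector.
constexpr unsigned int elems_read_per_iteration(unsigned int n, unsigned int stride_x)
{
    return 3 + (n - 1) * stride_x;
}

static_assert(elems_read_per_iteration(8, 1) == 10, "matches vload8 + vload2");
static_assert(elems_read_per_iteration(8, 2) == 17, "matches vload16 + 1 scalar");
static_assert(elems_read_per_iteration(8, 3) == 24, "matches vload16 + vload8");
```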
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index 36ba06d..da02227 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -96,11 +96,11 @@
AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
if(_run_derivative_x && _run_derivative_y)
{
- input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_read_rows_per_iteration);
+ input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_read_rows_per_iteration);
}
else if(_run_derivative_x)
{
- input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration);
+ input_access = AccessWindowHorizontal(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
}
else if(_run_derivative_y)
{
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 4f75311..76fdb6d 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -117,6 +117,7 @@
TensorShape output_shape = get_output_shape(input->tensor_shape(), weights->tensor_shape(), conv_info);
// Output auto inizialitation if not yet initialized
+ //input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output, output_shape,
1,
input->data_type(),
@@ -284,6 +285,7 @@
TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
// Output auto inizialitation if not yet initialized
+ //input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
auto_init_if_empty(*output->info(),
output_shape,
1,
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 7741f12..6886f54 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -33,8 +33,55 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, input->valid_region());
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
: _input(nullptr), _output(nullptr)
@@ -43,22 +90,13 @@
void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
- DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, input->info()->dimension(0) * 4);
- output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
@@ -68,20 +106,9 @@
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
// Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure(win);
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Set config_id for enabling LWS tuning
_config_id = "interleave4x4_";
@@ -92,6 +119,14 @@
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
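The hunks above show the pattern this change applies to most of the GEMM-related kernels: the checks move out of configure() into file-scope validate_arguments()/validate_and_configure_window() helpers, and a static validate() exposes them so a configuration can be checked without creating OpenCL objects. A minimal usage sketch follows; the shapes, data type and variable names are illustrative assumptions, not part of this patch.

#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

int main()
{
    // Static, allocation-free check of a proposed configuration (illustrative shapes)
    TensorInfo a_info(TensorShape(64U, 64U), 1, DataType::F32);
    TensorInfo out_info(compute_interleaved_shape(a_info), 1, DataType::F32);
    const Status status = CLGEMMInterleave4x4Kernel::validate(&a_info, &out_info);

    if(bool(status))
    {
        // Normal configuration path; the output tensor may be left empty and is
        // auto-initialized by configure() via compute_interleaved_shape()
        CLScheduler::get().default_init();
        CLTensor a;
        CLTensor a_interleaved;
        a.allocator()->init(a_info);
        CLGEMMInterleave4x4Kernel kernel;
        kernel.configure(&a, &a_interleaved);
    }
    return 0;
}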
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 1d9fe4b..423592b 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -46,6 +46,76 @@
class Coordinates;
} // namespace arm_compute
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ if(!is_interleaved_transposed)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
+ ElementsProcessed &num_elements_processed)
+{
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+ Window win{};
+ bool window_changed = false;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if(is_interleaved_transposed)
+ {
+ // Configure window
+ num_elems_processed_per_iteration_x = 16;
+ num_elems_processed_per_iteration_y = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
+ constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_read_per_iteration_input0, 1);
+ AccessWindowRectangle input1_access(input1, 0, 0, num_elems_read_per_iteration_input1, 1);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ }
+ else
+ {
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+ num_elems_processed_per_iteration_x = 16;
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+ // Configure window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr)
{
@@ -53,72 +123,37 @@
void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
- }
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
_input0 = input0;
_input1 = input1;
_output = output;
- CLBuildOptions build_opts;
+ ElementsProcessed num_elements_processed{};
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
+ // Create build options
+ CLBuildOptions build_opts;
+ std::string kernel_name(" ");
if(is_interleaved_transposed)
{
- // Create kernel and set static arguments
build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm_interleaved_transposed", build_opts.options()));
-
- // Configure window
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- constexpr unsigned int num_elems_read_per_iteration_input0 = 4;
- constexpr unsigned int num_elems_read_per_iteration_input1 = 16;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_read_per_iteration_input0, 1);
- AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_read_per_iteration_input1, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ kernel_name = "gemmlowp_mm_interleaved_transposed";
}
else
{
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
- constexpr unsigned int num_elems_processed_per_iteration_x = 16;
- const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
-
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_mm", build_opts.options()));
-
- // Configure window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+ kernel_name = "gemmlowp_mm";
}
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
// Set config_id for enabling LWS tuning
_config_id = "gemmlowp_";
@@ -132,6 +167,20 @@
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
+Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ is_interleaved_transposed,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
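As validate_arguments() above spells out, this kernel takes QASYMM8 inputs and produces S32 accumulators, and the input0/input1 dimension check only applies when the operands have not been interleaved/transposed. A hedged sketch of the new validate() entry point (shapes are illustrative; same header/namespace assumptions as the interleave sketch above, plus CLGEMMLowpMatrixMultiplyKernel.h):

TensorInfo a(TensorShape(32U, 16U), 1, DataType::QASYMM8);
TensorInfo b(TensorShape(24U, 32U), 1, DataType::QASYMM8);
TensorInfo dst(TensorShape(24U, 16U), 1, DataType::S32);

// is_interleaved_transposed == false, so a's width (32) must match b's height (32)
const Status status = CLGEMMLowpMatrixMultiplyKernel::validate(&a, &b, &dst, false);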
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index 2877a74..d05939f 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -44,6 +44,81 @@
class Coordinates;
} // namespace arm_compute
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+ // If a_offset == 0, vector_sum_col can be a nullptr
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+ }
+
+ // If b_offset == 0, vector_sum_row can be a nullptr
+ if(b_offset != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+ TensorShape output_shape = mm_result->tensor_shape();
+ if(output_shape.num_dimensions() > 1)
+ {
+ TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+ vector_sum_row_shape.collapse_from(1);
+ output_shape.collapse_from(2);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ "mm_result tensor must have the same number of batches of output tensor");
+
+ if(a_offset != 0)
+ {
+ TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+ vector_sum_col_shape.collapse_from(1);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ }
+ }
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ mm_result_access);
+
+ if(a_offset != 0)
+ {
+ AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_col_access);
+ }
+ if(b_offset != 0)
+ {
+ AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+ window_changed = window_changed || update_window_and_padding(win,
+ vector_sum_row_access);
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
: _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
{
@@ -51,7 +126,16 @@
void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ a_offset, b_offset)); // NOLINT
+
+ _vector_sum_col = vector_sum_col;
+ _vector_sum_row = vector_sum_row;
+ _mm_result = mm_result;
// Set the arguments to pass at compile time
CLBuildOptions build_opts;
@@ -59,74 +143,36 @@
// If a_offset == 0, vector_sum_col can be a nullptr
if(a_offset != 0)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0));
-
build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
}
-
// If b_offset == 0, vector_sum_row can be a nullptr
- if(b_offset != 0)
- {
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1));
-
- // Validate batches
- TensorShape output_shape = mm_result->info()->tensor_shape();
- if(output_shape.num_dimensions() > 1)
- {
- TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape();
- vector_sum_row_shape.collapse_from(1);
- output_shape.collapse_from(2);
-
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
-
- if(a_offset != 0)
- {
- TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape();
- vector_sum_col_shape.collapse_from(1);
-
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
- && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
- }
- }
-
- build_opts.add_option("-DB_OFFSET=" + support::cpp11::to_string(b_offset));
- }
-
+ build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
- _vector_sum_col = vector_sum_col;
- _vector_sum_row = vector_sum_row;
- _mm_result = mm_result;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window(mm_result->info(),
+ vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+ a_offset, b_offset); // NOLINT
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+ int32_t a_offset, int32_t b_offset)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+ vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+ vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+ a_offset, b_offset)
+ .first); // NOLINT
- update_window_and_padding(win, mm_result_access);
-
- if(a_offset != 0)
- {
- AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, vector_sum_col_access);
- }
-
- if(b_offset != 0)
- {
- AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
- update_window_and_padding(win, vector_sum_row_access);
- }
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMLowpOffsetContributionKernel::run(const Window &window, cl::CommandQueue &queue)
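Note how configure() and the new validate() forward nullptr tensor infos when an offset is zero: per validate_arguments() above, vector_sum_col is only required when a_offset != 0 and vector_sum_row only when b_offset != 0. A hedged sketch (shape is illustrative; same assumptions as the earlier sketches, plus CLGEMMLowpOffsetContributionKernel.h):

TensorInfo mm_result(TensorShape(24U, 16U), 1, DataType::S32);

// Both offsets are zero, so neither reduction vector is needed
const Status status = CLGEMMLowpOffsetContributionKernel::validate(&mm_result, nullptr, nullptr, 0, 0);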
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index bcf04b0..6951512 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -44,6 +44,59 @@
class Coordinates;
} // namespace arm_compute
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Status{};
+}
+std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+ const unsigned int num_elems_processed_per_iteration = 1;
+
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
ICLGEMMLowpReductionKernel::ICLGEMMLowpReductionKernel()
: _input(), _output()
{
@@ -51,8 +104,9 @@
void CLGEMMLowpMatrixAReductionKernel::configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
_input = mtx_a;
_output = vector_sum_row;
@@ -64,21 +118,18 @@
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
- const unsigned int num_elems_processed_per_iteration = 1;
-
// Configure kernel window
- Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+Status CLGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get()).first);
- update_window_and_padding(win,
- input_access,
- output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMLowpMatrixAReductionKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -107,8 +158,8 @@
void CLGEMMLowpMatrixBReductionKernel::configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info()));
_input = mtx_b;
_output = vector_sum_col;
@@ -121,21 +172,18 @@
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_b_reduction", build_opts.options()));
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
// Configure kernel window
- Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration));
+ auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+}
- AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), num_elems_processed_per_iteration), _input->info()->dimension(1));
- AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration);
+Status CLGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first);
- update_window_and_padding(win,
- input_access,
- output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMLowpMatrixBReductionKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
index 015b4f7..d5c93dd 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -36,6 +36,37 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+ num_elems_processed_per_iteration = (gpu_target == GPUTarget::BIFROST) ? 8 : 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1));
+ AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, biases_access, accum_access);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
: _accum(nullptr), _biases(nullptr)
{
@@ -43,18 +74,21 @@
void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum);
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
_biases = biases;
_accum = accum;
// Get the target architecture
- GPUTarget arch_target = get_arch_from_target(get_target());
- // Select the vector size to use (8 for Bifrost; 16 for Midgard).
- const unsigned int vector_size = (arch_target == GPUTarget::BIFROST) ? 8 : 16;
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ unsigned int vector_size = 0;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(accum->info(), biases->info(), arch_target, vector_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
// Add build options
CLBuildOptions build_opts;
@@ -65,18 +99,15 @@
// Create kernel
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
+}
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = vector_size;
+Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first);
- Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
- AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, biases_access, accum_access);
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
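Because the vector size (8 on Bifrost, 16 on Midgard) is picked inside validate_and_configure_window(), the static validate() takes the GPU target as an explicit argument, whereas configure() keeps reading it from the kernel via get_target(). A hedged sketch (shapes and target are illustrative; same assumptions as the earlier sketches, plus CLGEMMMatrixAccumulateBiasesKernel.h):

TensorInfo accum(TensorShape(128U, 32U), 1, DataType::F32);
TensorInfo biases(TensorShape(128U), 1, DataType::F32);

const Status status = CLGEMMMatrixAccumulateBiasesKernel::validate(&accum, &biases, GPUTarget::BIFROST);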
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 16706dd..19f38bf 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -42,6 +42,81 @@
using namespace arm_compute;
+namespace
+{
+using ElementsProcessed = Steps;
+
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ if(!is_interleaved_transposed)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+ }
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
+ bool is_interleaved_transposed, GPUTarget gpu_target,
+ ElementsProcessed &num_elements_processed)
+{
+ bool window_changed = false;
+ Window win{};
+
+ const DataType data_type = input0->data_type();
+ unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+ unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+
+ if(is_interleaved_transposed)
+ {
+ // Configure kernel window
+ num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+ num_elems_processed_per_iteration_y = 4;
+
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
+ AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ }
+ else // The input tensors have not been reshaped
+ {
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+ num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
+
+ // Create kernels according to the architecture, data type and input size.
+ if(gpu_target == GPUTarget::BIFROST && data_type == DataType::F32)
+ {
+ num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
+ }
+
+ // Configure window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
+ AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr)
{
@@ -49,13 +124,10 @@
void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
- }
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
_input0 = input0;
_input1 = input1;
@@ -82,14 +154,19 @@
_lws_hint = cl::NDRange(8, 8);
}
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, arch_target, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
// Create build options
CLBuildOptions build_opts;
build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos));
- const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f;
-
// Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications.
- if(multiply_alpha)
+ if(std::abs(1.0f - alpha) > 0.00001f)
{
build_opts.add_option_if_else(is_data_type_fixed_point(data_type),
"-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))),
@@ -108,49 +185,19 @@
{
kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
}
-
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
}
else // The input tensors have not been reshaped
{
build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
- unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
- const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
-
// Create kernels according to the architecture, data type and input size.
if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
{
// The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
// FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
// FC6 and FC7 of AlexNet and VGG-16).
- if(input1->info()->dimension(0) <= 1000)
- {
- // Each work-item processes 2 elements in the X dimension.
- num_elems_processed_per_iteration_x = 2;
- kernel_name = "gemm_mm_floating_point_f32_bifrost_1000";
- }
- else
- {
- // Each work-item processes 4 elements in the X dimension (as in the default case).
- num_elems_processed_per_iteration_x = 4;
- kernel_name = "gemm_mm_floating_point_f32_bifrost";
- }
+ kernel_name = (input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1) ? "gemm_mm_floating_point_f32_bifrost_1000" : "gemm_mm_floating_point_f32_bifrost";
+
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
// via exhaustive autotuning over a range of representative layer configurations.
_lws_hint = cl::NDRange(4);
@@ -164,23 +211,8 @@
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
kernel_name = "gemm_mm_floating_point";
}
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
-
- // Configure window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
+ build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
}
// Create kernel
@@ -198,6 +230,22 @@
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+{
+ ElementsProcessed num_elements_processed{};
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
+ input1->clone().get(),
+ output->clone().get(),
+ is_interleaved_transposed,
+ gpu_target,
+ num_elements_processed)
+ .first);
+
+ return Status{};
+}
+
void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
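The new validate() accepts alpha only to mirror configure()'s signature; it is marked ARM_COMPUTE_UNUSED because the scaling factor has no bearing on the shape, type or window checks. A hedged sketch (F32 shapes and the target are illustrative; same assumptions as the earlier sketches, plus CLGEMMMatrixMultiplyKernel.h):

TensorInfo a(TensorShape(32U, 16U), 1, DataType::F32);
TensorInfo b(TensorShape(24U, 32U), 1, DataType::F32);
TensorInfo dst(TensorShape(24U, 16U), 1, DataType::F32);

const Status status = CLGEMMMatrixMultiplyKernel::validate(&a, &b, &dst, 1.0f, false, GPUTarget::MIDGARD);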
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 35074f9..69a545b 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -33,36 +33,82 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <cmath>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_transpose1xW_with_element_size_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+{
+ num_elems_processed_per_iteration = 16 / input->element_size();
+
+ const int scale_x = num_elems_processed_per_iteration;
+ bool window_changed = false;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ if((win.x().end() / scale_x) == 0)
+ {
+ return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Transposed shape would be 0 in the second dimension"), win);
+ }
+
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Configure window in case of configured output
+ if(output->total_size() != 0)
+ {
+ AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+ window_changed = window_changed || update_window_and_padding(win, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
- DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- const size_t transpose_w = 16 / input->info()->element_size();
- output_shape.set(0, input->info()->dimension(1) * transpose_w);
- output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const int scale_x = num_elems_processed_per_iteration;
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
+ // Configure kernel window
+ unsigned int num_elems_processed_per_iteration = 1;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure(win_config.second);
+
/*
* Following an example of how the transposition1xW works when the input data type is F32
*
@@ -76,20 +122,15 @@
// Create kernel
std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+}
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ unsigned int num_elems_processed_per_iteration = 1;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
- ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
-
- ICLKernel::configure(win);
+ return Status{};
}
void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
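The hand-rolled shape computation removed above (output x = y_in * W, output y = ceil(x_in / W), with W = 16 / element_size) now comes from compute_transpose1xW_with_element_size_shape(). A hedged worked example, assuming an F32 input (so W = 16 / 4 = 4) and the shape_calculator namespace pulled in as the file above does:

TensorInfo in(TensorShape(24U, 8U), 1, DataType::F32);

// (24, 8) -> (8 * 4, ceil(24 / 4)) = (32, 6)
const TensorShape transposed_shape = compute_transpose1xW_with_element_size_shape(in);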
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 6514d6c..0e9f2c5 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -39,6 +39,24 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
CLIm2ColKernel::CLIm2ColKernel()
: _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
{
@@ -46,9 +64,10 @@
void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
@@ -184,6 +203,15 @@
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_UNUSED(kernel_dims);
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_UNUSED(has_bias);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
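The new CLIm2ColKernel::validate() currently re-runs only the data-type/fixed-point checks; kernel_dims, conv_info and has_bias are accepted for interface symmetry but marked unused. A hedged sketch (all values illustrative; same assumptions as the earlier sketches, plus CLIm2ColKernel.h):

TensorInfo src(TensorShape(16U, 16U, 3U), 1, DataType::F32);
TensorInfo col; // still empty, so the output-side checks are skipped

const Status status = CLIm2ColKernel::validate(&src, &col, Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1), false);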
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 508fb89..a3af5b0 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,7 +88,7 @@
update_window_and_padding(win, input0_access, input1_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
}
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
new file mode 100644
index 0000000..132de60
--- /dev/null
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+CLPermuteKernel::CLPermuteKernel()
+ : _input(nullptr), _output(nullptr), _perm()
+{
+}
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
+{
+ TensorShape output_shape = input->tensor_shape();
+ permute(output_shape, perm);
+ return output_shape;
+}
+} // namespace
+
+void CLPermuteKernel::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
+ DataType::U16, DataType::S16, DataType::QS16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->num_dimensions() < 3, "Invalid input size!");
+ ARM_COMPUTE_ERROR_ON_MSG(
+ (perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))) && (perm.num_dimensions() != 4 && ((perm[0] != 2 && perm[1] != 0
+ && perm[2] != 1)
+ || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))),
+ "Only [2, 0, 1],[1, 2, 0] and [3, 2, 0, 1] permutation is supported");
+
+ _input = input;
+ _output = output;
+ _perm = perm;
+
+ const TensorShape output_shape = get_output_shape(input->info(), perm);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ // Create kernel
+ std::set<std::string> build_opts;
+
+ build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+ build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+
+ // Run [2, 0, 1] permute
+ if(_perm[0] == 2 && _perm[1] == 0 && _perm[2] == 1)
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
+ }
+ // Run [1, 2, 0] permute
+ else if(_perm[0] == 1 && _perm[1] == 2 && _perm[2] == 0)
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
+ }
+ // Run [3, 2, 0, 1] permute
+ else if(_perm[0] == 3 && _perm[1] == 2 && _perm[2] == 0 && _perm[3] == 1)
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ ICLKernel::configure(win);
+}
+
+void CLPermuteKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window slice_in = window.first_slice_window_4D();
+ Window slice_out(slice_in);
+
+ // Setup output slice
+ slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ slice_out.set(3, Window::Dimension(0, 0, 0));
+
+ do
+ {
+ auto collapsed_slice_in = slice_in.collapse(ICLKernel::window(), 2);
+ auto collapsed_slice_out = slice_out.collapse(ICLKernel::window(), 2);
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, collapsed_slice_in);
+ add_4D_tensor_argument(idx, _output, collapsed_slice_out);
+ enqueue(queue, *this, collapsed_slice_in);
+ }
+ while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
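A hedged sketch of driving the new permute kernel directly for the supported [2, 0, 1] case; the shape and data type are illustrative, the CL backend is assumed to be initialized as in the interleave sketch further up (plus CLPermuteKernel.h), and in practice the kernel would normally be reached through a runtime-level function rather than configured by hand:

CLTensor src;
CLTensor dst;
src.allocator()->init(TensorInfo(TensorShape(2U, 3U, 4U), 1, DataType::F32));

CLPermuteKernel permute_kernel;
// [2, 0, 1] permute: dst is auto-initialized with the permuted shape
permute_kernel.configure(&src, &dst, PermutationVector(2U, 0U, 1U));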
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index fd5e5d5..6dba9c0 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -227,7 +227,8 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
@@ -237,5 +238,5 @@
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
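
The change above makes the kernel iterate over a window whose dimensions from Z upwards are collapsed into a single Z dimension where possible, so an entire batched tensor can be covered by one 3D slice instead of one slice per batch index. A hedged sketch of the effect on the number of enqueues, assuming the collapse succeeds (plain arithmetic only, not the library API):

    #include <cstddef>
    #include <iostream>

    // Illustrative only: number of 3D slices enqueued for a 4D iteration space
    // before and after merging dimension 3 into dimension 2.
    int main()
    {
        const std::size_t z = 16, w = 8; // hypothetical extents of dimensions 2 and 3

        const std::size_t slices_uncollapsed = w; // one 3D slice per index along dimension 3
        const std::size_t slices_collapsed   = 1; // one 3D slice with a merged Z extent of z * w

        std::cout << "enqueues before: " << slices_uncollapsed
                  << ", after: " << slices_collapsed
                  << " (collapsed Z extent = " << z * w << ")\n";
        return 0;
    }
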
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index ac368c7..860cc92 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,39 +123,26 @@
const int input_width = input->dimension(0);
const int input_height = input->dimension(1);
- unsigned int num_elems_processed_per_iteration = 1;
+ // Change the number of elements processed per iteration
+    // for pooling 3x3 with stride less than or equal to 3
+ const bool can_optimize = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
+ const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1;
+ const int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size;
- if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
- {
- const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+ // Number of iterations in X dimension
+ const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
- int num_elems_read_per_iteration = pool_size;
- if(is_pool3x3_stride_le3)
- {
- // Change the number of elements processed and the number of elements read per iteration
- // for pooling 3x3 with stride less equal than 3
- num_elems_processed_per_iteration = 4;
- num_elems_read_per_iteration = pool_size * (pool_stride_x + 1);
- }
+ // Upper limit for the number of right/bottom border elements that are accessed
+ const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_x);
- border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- }
- else
- {
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_x);
- border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- }
+ border_size.right = std::max(upper_bound_w, pool_pad_x);
+ border_size.bottom = std::max(upper_bound_h, pool_pad_y);
Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
- AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+ AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, num_elems_read_per_iteration, pool_size,
+ pool_stride_x * num_elems_processed_per_iteration, pool_stride_y);
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -305,8 +292,12 @@
{
// Upsample input by pool size
Window in_slice(slice);
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+ in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x,
+ (in_slice.x().end() - pool_pad_x) * pool_stride_x,
+ pool_stride_x * _num_elems_processed_per_iteration));
+ in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y,
+ (in_slice.y().end() - pool_pad_y) * pool_stride_y,
+ pool_stride_y));
// Set inputs
unsigned int idx = 0;
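
A concrete, hypothetical check of the rewritten border arithmetic: with pool_size = 3, pool_stride_x = 2, pool_pad_x = 1, input_width = 7, pooled_w = 4 and F32 data, the optimized path processes 4 elements per iteration and reads (4 - 1) * 2 + 3 = 9, runs a single iteration in X, and gets upper_bound_w = (0 * 4 * 2 - 1 + 9) - 7 = 1, so border_size.right = max(1, 1) = 1 (the bottom border follows the same pattern in Y). The same computation as a self-contained sketch:

    #include <algorithm>
    #include <cassert>

    // Hypothetical parameters; reproduces the border-size arithmetic from the hunk above.
    int main()
    {
        const int pool_size = 3, pool_stride_x = 2, pool_pad_x = 1;
        const int input_width = 7, pooled_w = 4;

        const bool can_optimize = true; // 3x3 pooling, stride <= 3, not quantized
        const int  num_elems_processed_per_iteration = can_optimize ? 4 : 1;
        const int  num_elems_read_per_iteration      = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size;
        const int  num_iterations_x                  = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;

        const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x
                                   - pool_pad_x + num_elems_read_per_iteration)
                                  - input_width;
        const int border_right = std::max(upper_bound_w, pool_pad_x);

        assert(num_elems_read_per_iteration == 9);
        assert(num_iterations_x == 1 && upper_bound_w == 1 && border_right == 1);
        return 0;
    }
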
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 4b137b0..298c700 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <cstddef>
#include <cstdint>
@@ -37,13 +38,6 @@
namespace
{
-TensorShape get_output_shape(const ITensorInfo *input, const PermutationVector &perm)
-{
- TensorShape output_shape = input->tensor_shape();
- permute(output_shape, perm);
- return output_shape;
-}
-
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
@@ -51,56 +45,85 @@
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 3, "Invalid input size!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0)),
- "Only [2, 0, 1] and [1, 2, 0] permutation is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ (perm.num_dimensions() != 3 && ((perm[0] != 2 && perm[1] != 0 && perm[2] != 1) || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))) && (perm.num_dimensions() != 4 && ((perm[0] != 2 && perm[1] != 0
+ && perm[2] != 1)
+ || (perm[0] != 1 && perm[1] != 2 && perm[2] != 0))),
+        "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutations are supported");
+
+ const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
// Validate configured output
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, perm));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
}
+
+template <typename T>
+inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
+{
+ const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
+ for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ {
+ dimensions[perm[i]] = old_dim[i];
+ }
+}
+
} // namespace
template <typename T>
void CPPPermuteKernel::run_permute(const Window &window)
{
- const int output_stride_x = _output->info()->strides_in_bytes().x();
- const int output_stride_y = _output->info()->strides_in_bytes().y();
- const int output_stride_z = _output->info()->strides_in_bytes().z();
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
+ Strides strides = _output->info()->strides_in_bytes();
+ Strides perm_strides = strides;
+ permute_strides(perm_strides, _perm);
+ const int output_stride_w = strides[3];
+ Window window_out(window);
+ const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
+ for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+ {
+ window_out.set(d, zero_window);
+ }
// Create iterators
Iterator in(_input, window);
Iterator out(_output, window_out);
-
- // Run [2, 0, 1] permute
- if(_perm[0] == 2 && _perm[1] == 0 && _perm[2] == 1)
+ ARM_COMPUTE_ERROR_ON(_perm.num_dimensions() > _input->info()->num_dimensions());
+ if(_input->info()->num_dimensions() <= 3)
{
execute_window_loop(window, [&](const Coordinates & id)
{
- const int idx = id.y() * output_stride_z + id.x() * output_stride_y + id.z() * output_stride_x;
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
*(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
},
in, out);
}
- // Run [1, 2, 0] permute
- else
+ else if(_input->info()->num_dimensions() >= 4)
{
- execute_window_loop(window, [&](const Coordinates & id)
+ if(_perm.num_dimensions() < _input->info()->num_dimensions())
{
- const int idx = id.x() * output_stride_z + id.z() * output_stride_y + id.y() * output_stride_x;
- *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
- },
- in, out);
+            // Special case: perm has only 3 dimensions while the tensor has more, so _perm[3] would be invalid; dimension 3 is handled with id[3] * output_stride_w instead of id[3] * perm_strides[3]
+ ARM_COMPUTE_ERROR_ON(_perm.num_dimensions() < 3);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * output_stride_w;
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
+ }
+ else
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
+ *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+ },
+ in, out);
+ }
}
}
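
As a worked example of the offset computation above (the numbers are hypothetical): take an F32 output whose strides_in_bytes are {4, 32, 256, 1024} for dimensions 0..3 and perm = [2, 0, 1]. permute_strides() places old stride i at position perm[i], giving perm_strides = {32, 256, 4, 1024}, so the input element at coordinate (x, y, z) lands at output byte offset x * 32 + y * 256 + z * 4, i.e. the input X axis becomes output dimension 1, Y becomes dimension 2 and Z becomes dimension 0. A self-contained sketch of the same arithmetic:

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Illustrative only: mirrors permute_strides() and the idx formula used in run_permute() above.
    int main()
    {
        const std::array<std::size_t, 4> out_strides{ 4, 32, 256, 1024 }; // hypothetical F32 output strides
        const std::array<std::size_t, 3> perm{ 2, 0, 1 };

        std::array<std::size_t, 4> perm_strides = out_strides;
        for(std::size_t i = 0; i < perm.size(); ++i)
        {
            perm_strides[perm[i]] = out_strides[i];
        }

        // Destination byte offset of the input element at coordinate (x, y, z) = (3, 1, 2)
        const std::size_t x = 3, y = 1, z = 2;
        const std::size_t idx = x * perm_strides[0] + y * perm_strides[1] + z * perm_strides[2];
        assert(idx == 3 * 32 + 1 * 256 + 2 * 4); // 360
        return 0;
    }
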
@@ -112,9 +135,9 @@
void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
+ const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input->info(), perm);
     // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), perm)));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), perm));
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 53a10f9..0b9cd3f 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -190,7 +190,6 @@
const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map =
{
{ "absdiff", "absdiff.cs" },
- { "col2im", "convolution_layer.cs" },
{ "direct_convolution1x1", "direct_convolution1x1.cs" },
{ "direct_convolution3x3", "direct_convolution3x3.cs" },
{ "direct_convolution5x5", "direct_convolution5x5.cs" },
@@ -207,9 +206,11 @@
{ "gemm_mm_interleaved_transposed", "gemm.cs" },
{ "gemm_mm_floating_point", "gemm.cs" },
{ "gemm_transpose1x4", "gemm.cs" },
+ { "reshape_to_columns", "convolution_layer.cs" },
{ "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" },
{ "im2col_generic", "convolution_layer.cs" },
{ "im2col_reduced", "convolution_layer.cs" },
+ { "col2im", "convolution_layer.cs" },
{ "transpose", "transpose.cs" },
{ "activation_layer", "activation_layer.cs" },
{ "softmax_layer_max", "softmax_layer.cs" },
@@ -220,6 +221,10 @@
{ "batchnormalization_layer", "batchnormalization_layer.cs" },
{ "concatenate_depth", "concatenate.cs" },
{ "dropout", "dropout.cs" },
+ { "normalize_planar_yuv_layer", "normalize_planar_yuv_layer.cs" },
+ { "scale_nearest_neighbour", "scale.cs" },
+ { "arithmetic_add", "arithmetic_add.cs" },
+ { "depthwise_convolution_3x3", "depthwise_convolution3x3.cs" },
};
const std::map<std::string, std::string> GCKernelLibrary::_program_source_map =
@@ -289,11 +294,27 @@
"dropout.cs",
#include "./cs_shaders/dropout.csembed"
},
+ {
+ "normalize_planar_yuv_layer.cs",
+#include "./cs_shaders/normalize_planar_yuv_layer.csembed"
+ },
+ {
+ "scale.cs",
+#include "./cs_shaders/scale.csembed"
+ },
+ {
+ "arithmetic_add.cs",
+#include "./cs_shaders/arithmetic_add.csembed"
+ },
+ {
+ "depthwise_convolution3x3.cs",
+#include "./cs_shaders/depthwise_convolution3x3.csembed"
+ },
#endif /* EMBEDDED_KERNELS */
};
GCKernelLibrary::GCKernelLibrary()
- : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _own_context(false), _shader_path("./"), _programs_map(), _built_programs_map()
+ : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _shader_path("./"), _programs_map(), _built_programs_map()
{
}
@@ -360,21 +381,9 @@
FIRST,
SKIP_COMMENTS = FIRST,
RESOLVE_INCLUDES,
- SKIP_PREPROCESSOR_DIRECTIVES,
- SEARCH_MACRO_DEFINITIONS,
- EXPAND_MACRO_USES,
LAST
};
- struct MacroDefinitionInfo
- {
- const std::vector<std::string> param_list;
- const std::string content;
- };
-
- // Found macro definitions so far
- std::map<const std::string, const MacroDefinitionInfo> macro_definitions;
-
// Define a GLES compute shader parser function
std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string
@@ -396,35 +405,6 @@
case ParserStage::RESOLVE_INCLUDES:
search_pattern = R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx";
break;
- case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
- search_pattern = R"((^|\n)[ \t]*(#ifdef|#ifndef|#if)[^\n]+)";
- break;
- case ParserStage::SEARCH_MACRO_DEFINITIONS:
- search_pattern = R"((?:^|\n)[ \t]*#define[ \t]+(\w+)(?:\((\w+(?:[ \t]*,[ \t]*\w+)*)\))?(?: |\t|\\\n)*((?:(?:[^\\\n]|\\[^\n])*\\+\n)*(?:[ \t]*[^ \t\n]+)*)[ \t]*)";
- break;
- case ParserStage::EXPAND_MACRO_USES:
- {
- if(macro_definitions.empty())
- {
- // Nothing to expand
- return src;
- }
- int i = 0;
- for(auto &def : macro_definitions)
- {
- if(i == 0)
- {
- search_pattern = R"((\b)" + def.first;
- }
- else
- {
- search_pattern += R"(\b|\b)" + def.first;
- }
- i++;
- }
- search_pattern += R"(\b))";
- break;
- }
default:
break;
}
@@ -449,126 +429,7 @@
dst.append(cs_parser(read_file(source_name, false), ParserStage::FIRST, 0));
break;
}
- case ParserStage::SEARCH_MACRO_DEFINITIONS:
- {
- std::regex params_regex(R"(\b\w+\b)");
- const std::string macro_param_str = match.str(2);
- const std::vector<std::string> macro_param_list(
- std::sregex_token_iterator(macro_param_str.begin(),
- macro_param_str.end(),
- params_regex),
- std::sregex_token_iterator());
-
- const MacroDefinitionInfo info =
- {
- macro_param_list,
- match.str(3)
- };
- // Collect the macro definition data and not change the shader source
- macro_definitions.insert(std::pair<const std::string, const MacroDefinitionInfo>(match.str(1), info));
- dst.append(match.str());
- break;
- }
- case ParserStage::EXPAND_MACRO_USES:
- {
- ptrdiff_t args_str_length = 0;
- std::vector<std::string> args_list;
-
- // Walk through argument list, because the regular expression does NOT support nested parentheses
- size_t cur_args_str_pos = match.position() + match.length();
- if(src[cur_args_str_pos++] == '(')
- {
- int nested_parentheses = 0;
- ptrdiff_t cur_arg_pos = cur_args_str_pos;
- ptrdiff_t cur_arg_length = 0;
-
- args_str_length++;
- while(src[cur_args_str_pos] != ')' || nested_parentheses != 0)
- {
- switch(src[cur_args_str_pos++])
- {
- case '(':
- nested_parentheses++;
- cur_arg_length++;
- break;
- case ',':
- if(nested_parentheses == 0)
- {
- args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
- cur_arg_pos = cur_args_str_pos;
- cur_arg_length = 0;
- }
- else
- {
- cur_arg_length++;
- }
- break;
- case ' ':
- case '\t':
- if(cur_arg_length == 0)
- {
- cur_arg_pos++;
- }
- else
- {
- cur_arg_length++;
- }
- break;
- case ')':
- nested_parentheses--;
- // no break here!
- default:
- cur_arg_length++;
- break;
- }
- args_str_length++;
- }
- if(src[cur_args_str_pos] == ')' && nested_parentheses == 0)
- {
- args_list.push_back(src.substr(cur_arg_pos, cur_arg_length));
- }
- args_str_length++;
- }
-
- std::string expanded_content = match.str();
- const std::vector<std::string> macro_param_list = macro_definitions.at(match.str()).param_list;
-
- if((nested_level != 0 || !macro_param_list.empty()) && macro_param_list.size() == args_list.size())
- {
- parsed_pos += args_str_length;
- expanded_content = macro_definitions.at(match.str()).content;
- size_t i = 0;
- for(auto ¶m_name : macro_param_list)
- {
- std::regex params_regex(R"(\b)" + param_name + R"(\b)");
- expanded_content.assign(std::regex_replace(expanded_content, params_regex, args_list[i]));
- ++i;
- }
- // Expand macro recursively
- expanded_content = cs_parser(expanded_content, stage, nested_level + 1);
-
- if(nested_level == 0)
- {
- const std::regex token_pasting_rgx = std::regex(R"(\b##\b)");
- if(std::regex_search(expanded_content, token_pasting_rgx))
- {
- // Remove token pasting operator "##"
- expanded_content.assign(std::regex_replace(expanded_content, std::regex(token_pasting_rgx), ""));
- // Trim trailing whitespace
- expanded_content.assign(std::regex_replace(expanded_content, std::regex(R"([ \t]*\\\n)"), "\n"));
- }
- else
- {
- // Do not expand the macro if the result does not have token pasting operator "##"
- expanded_content = src.substr(match.position(), match.length() + args_str_length);
- }
- }
- }
- dst.append(expanded_content);
- break;
- }
case ParserStage::SKIP_COMMENTS:
- case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES:
default:
dst.append(match.str());
break;
@@ -602,11 +463,7 @@
ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
}
- // We should do the preprocess at compile time
- // The preprocess_shader function is used for support "#include" directive and token pasting operator "##".
- // This job could be done at compile time by using a python script in order to get better performance at runtime.
- // BTW: We usually defined EMBEDDED_KERNELS in release build.
- program = GCProgram(program_name, preprocess_shader(program_source_it->second));
+ program = GCProgram(program_name, program_source_it->second);
#else /* EMBEDDED_KERNELS */
// Check for binary
std::string source_name = _shader_path + program_name;
@@ -626,59 +483,6 @@
return new_program.first->second;
}
-void GCKernelLibrary::setup_context()
-{
- EGLBoolean res;
- _display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-
- ARM_COMPUTE_ERROR_ON_MSG(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
-
- res = eglInitialize(_display, nullptr, nullptr);
-
- ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS);
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
- ARM_COMPUTE_UNUSED(egl_extension_st);
-
- const EGLint config_attribs[] =
- {
- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
- EGL_NONE
- };
- EGLConfig cfg;
- EGLint count;
-
- res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
-
- ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglBindAPI(EGL_OPENGL_ES_API);
-
- ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
-
- const EGLint attribs[] =
- {
- EGL_CONTEXT_CLIENT_VERSION, 3,
- EGL_NONE
- };
- _context = eglCreateContext(_display,
- cfg,
- EGL_NO_CONTEXT,
- attribs);
-
- ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
-
- ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-}
-
void GCKernelLibrary::setup_dummy_fbo()
{
ARM_COMPUTE_GL_CHECK(glGenFramebuffers(1, &_frame_buffer));
@@ -700,15 +504,6 @@
ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, 0));
ARM_COMPUTE_GL_CHECK(glDeleteTextures(1, &_tex_rt));
ARM_COMPUTE_GL_CHECK(glDeleteFramebuffers(1, &_frame_buffer));
-
- if(_own_context)
- {
- eglDestroyContext(_display, _context);
- eglTerminate(_display);
-
- _context = EGL_NO_CONTEXT;
- _display = EGL_NO_DISPLAY;
- }
}
std::string GCKernelLibrary::stringify_set(const StringSet &s) const
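
The entries added to _shader_program_map earlier in this file map each new kernel name to the compute shader file that provides it; a simplified lookup sketch of that resolution step (not the library's actual accessor):

    #include <iostream>
    #include <map>
    #include <string>

    // Simplified illustration: resolve a kernel name to its shader source file.
    int main()
    {
        const std::map<std::string, std::string> shader_program_map =
        {
            { "arithmetic_add", "arithmetic_add.cs" },
            { "depthwise_convolution_3x3", "depthwise_convolution3x3.cs" },
            { "scale_nearest_neighbour", "scale.cs" },
        };

        const auto it = shader_program_map.find("arithmetic_add");
        if(it != shader_program_map.end())
        {
            std::cout << "kernel 'arithmetic_add' is provided by " << it->second << "\n";
        }
        return 0;
    }
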
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index c60c167..55b7f0d 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,7 +62,7 @@
}
IGCKernel::IGCKernel()
- : _kernel()
+ : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U))
{
}
@@ -79,7 +79,7 @@
}
template <unsigned int dimension_size>
-void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window)
+void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
@@ -103,7 +103,6 @@
}
_kernel.set_argument(idx++, offset_first_element);
- _kernel.set_argument(idx++, param.buffer_data_type_shift);
// Rounding up the tensor attributes structure in compute shader to a multiple of a vec4
unsigned int idx_end = ceil_to_multiple(idx, 4);
@@ -113,7 +112,7 @@
}
idx = idx_end;
- ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, param.binding_point, tensor->gc_buffer()));
+ ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding_point, tensor->gc_buffer()));
ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
"add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
@@ -122,32 +121,17 @@
void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
{
- add_tensor_argument<1>(idx, tensor, BufferParam(binding_point, 0), window);
-}
-
-void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window)
-{
- add_tensor_argument<1>(idx, tensor, param, window);
+ add_tensor_argument<1>(idx, tensor, binding_point, window);
}
void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
{
- add_tensor_argument<2>(idx, tensor, BufferParam(binding_point, 0), window);
-}
-
-void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window)
-{
- add_tensor_argument<2>(idx, tensor, param, window);
+ add_tensor_argument<2>(idx, tensor, binding_point, window);
}
void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
{
- add_tensor_argument<3>(idx, tensor, BufferParam(binding_point, 0), window);
-}
-
-void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window)
-{
- add_tensor_argument<3>(idx, tensor, param, window);
+ add_tensor_argument<3>(idx, tensor, binding_point, window);
}
unsigned int IGCKernel::num_arguments_per_1D_tensor() const
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index 38ba183..7d3f4ee 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -23,7 +23,7 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
#ifdef DATA_TYPE_FP32
precision highp float;
@@ -114,47 +114,35 @@
return MLA_OP(float(B_VAL), float(A_VAL), x);
}
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-#ifdef DATA_TYPE_FP32
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-
 /** This performs an activation function on floating point inputs.
*
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
* @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
* @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- float data = src_ptr[src.current_offset];
+ float data = LOAD_CURRENT_ITEM(src_ptr, src_iter);
float data_out = 0.f;
// Perform activation
-
#ifdef LOGISTIC
data_out = logistic_op(data);
#elif defined(TANH) /*LOGISTIC*/
@@ -181,44 +169,22 @@
#error Activation function not provided
#endif /*LOGISTIC*/
- dst_ptr[dst.current_offset] = data_out;
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, data_out);
}
#elif defined(DATA_TYPE_FP16)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-/** This performs an activation function floating point inputs.
- *
- * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
- * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
void main(void)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- uint data = src_ptr[src.current_offset >> 2];
+ vec2 data = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
// Perform activation
- float a = unpackHalf2x16(data).x;
- float b = unpackHalf2x16(data).y;
+ float a = data.x;
+ float b = data.y;
vec2 data_out;
#ifdef LOGISTIC /*LOGISTIC*/
data_out.x = logistic_op(a);
@@ -257,6 +223,6 @@
#error Activation function not provided
#endif /*LOGISTIC*/
- dst_ptr[dst.current_offset >> 2] = packHalf2x16(data_out);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data_out);
}
-#endif /*DATA_TYPE_FP32*/
+#endif /*DATA_TYPE_FP16*/
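
The rewritten shader is configured entirely through preprocessor defines, as the @note lines above describe (data type, activation name, A_VAL/B_VAL). A hedged, self-contained sketch of assembling such a define set on the host side; the helper and the exact option strings are illustrative, not the library's builder:

    #include <iostream>
    #include <set>
    #include <string>

    // Illustrative only: collect the defines the activation shader documents above.
    std::set<std::string> activation_build_options(const std::string &data_type, // e.g. "DATA_TYPE_FP32"
                                                   const std::string &act,       // e.g. "LOGISTIC", "TANH"
                                                   float a_val, float b_val)
    {
        std::set<std::string> opts;
        opts.emplace("#define " + data_type);
        opts.emplace("#define " + act);
        opts.emplace("#define A_VAL " + std::to_string(a_val));
        opts.emplace("#define B_VAL " + std::to_string(b_val));
        return opts;
    }

    int main()
    {
        for(const auto &o : activation_build_options("DATA_TYPE_FP16", "TANH", 1.0f, 1.0f))
        {
            std::cout << o << "\n";
        }
        return 0;
    }
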
diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
new file mode 100755
index 0000000..0ff4360
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+precision mediump float;
+#define ADD(x, y) (x) + (y)
+
+/** This function adds two images.
+ *
+ * @param[in] src1_ptr Pointer to the first source image. Supported data types: F16
+ * @param[in] src1_attrs The attributes of the first source image
+ * @param[in] src2_ptr Pointer to the second source image. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_attrs The attributes of the second source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_attrs The attributes of the destination image
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src1_attrs;
+ ImageAttributes src2_attrs;
+ ImageAttributes dst_attrs;
+};
+
+TENSOR_DECLARATION(1, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
+TENSOR_DECLARATION(2, src2Buffer, uvec4, src2_ptr, src2_shift, 4, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+
+void main(void)
+{
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
+ ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ vec4 tmp1[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
+ vec4 tmp2[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src2_ptr, src2_iter);
+ vec4 addition[2];
+ addition[0] = ADD(tmp1[0], tmp2[0]);
+ addition[1] = ADD(tmp1[1], tmp2[1]);
+
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, addition);
+}
\ No newline at end of file
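
arithmetic_add.cs works on packed FP16 data: every 32-bit word of the uvec4 holds two half-precision values, which LOAD_UNPACK8_CURRENT_ITEM_HALF expands to eight floats before the addition and STORE_PACK8_CURRENT_ITEM_HALF packs back. A minimal bit-level sketch of that two-halves-per-word layout (packing only; the half/float conversion itself is omitted):

    #include <cassert>
    #include <cstdint>

    // Two 16-bit half-float bit patterns share one 32-bit word: low half in
    // bits 0..15, high half in bits 16..31 (the layout used by packHalf2x16).
    uint32_t pack2(uint16_t lo, uint16_t hi)
    {
        return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
    }

    void unpack2(uint32_t word, uint16_t &lo, uint16_t &hi)
    {
        lo = static_cast<uint16_t>(word & 0xffffu);
        hi = static_cast<uint16_t>(word >> 16);
    }

    int main()
    {
        const uint16_t lo = 0x3c00, hi = 0x4000; // bit patterns of half 1.0 and 2.0
        uint16_t a = 0, b = 0;
        unpack2(pack2(lo, hi), a, b);
        assert(a == lo && b == hi);
        return 0;
    }
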
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 65000f2..0c8b5bf 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -22,85 +22,51 @@
* SOFTWARE.
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
-#ifdef DATA_TYPE_FP32
-precision highp float;
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif /*DATA_TYPE_FP16*/
/** This kernel concatenates the input tensor into the output tensor along the third dimension
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- dst_ptr[dst.current_offset + uint(OFFSETS_Z >> 2)] = src_ptr[tensor3D_offset(src, -OFFSETS_X, -OFFSETS_Y, 0)];
+ float tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
+ STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
}
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-
-/** This kernel concatenates the input tensor into the output tensor along the third dimension
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
void main(void)
{
- Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- uvec2 packed_s;
- GC_LOAD1_3D_OFFSET(packed_s, src, -OFFSETS_X, -OFFSETS_Y, 0);
- dst_ptr[(dst.current_offset + uint(OFFSETS_Z)) >> 3] = packed_s;
+ uvec2 tmp = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, -OFFSETS_X, -OFFSETS_Y, 0));
+ STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, OFFSETS_Z), tmp);
}
-#endif /*DATA_TYPE_FP32*/
\ No newline at end of file
+#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 9976368..344d480 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,80 +23,562 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
-layout(std140) uniform shader_params
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+#ifdef RESHAPE_TO_COLUMNS
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note If biases are to be added to the convolution, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_attrs The attributes of the biases tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ */
+
+SHADER_PARAMS_DECLARATION
{
-#ifdef IM2COL_GENERIC
- TENSOR3D_PARAM_DECLARATION(src);
- IMAGE_PARAM_DECLARATION(dst);
- uint filter_depth;
- uint src_stride_w;
- uint dst_stride_w;
-#endif // IM2COL_GENERIC
-
-#ifdef IM2COL_REDUCED
- TENSOR3D_PARAM_DECLARATION(src);
- VECTOR_PARAM_DECLARATION(dst);
+ Tensor3DAttributes src_attrs;
+ ImageAttributes dst_attrs;
+#ifdef HAS_BIAS
+ VectorAttributes biases_attrs;
+#endif /* HAS_BIAS */
uint width;
uint height;
-#endif // IM2COL_REDUCED
-
-#ifdef COL2IM
- IMAGE_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- uint width;
-#endif // COL2IM
+ uint depth;
+ uint total_filters;
};
-#ifdef DATA_TYPE_FP16
-#if defined(IM2COL_REDUCED_8X)
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, restrict);
-#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, restrict);
-#else /* IM2COL_REDUCED_8X */
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, restrict);
-#endif /* IM2COL_REDUCED_8X */
+#if defined(DATA_TYPE_FP16)
-precision mediump float;
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+#ifdef HAS_BIAS
+TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+void main()
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+#ifdef HAS_BIAS
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+ bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
+ && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(
+ gl_GlobalInvocationID.z)
+ * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
+ // Linearize convolution elements
+ if(is_last_thread)
+ {
+ for(uint i = 0u; i < uint(total_filters); i = i + 2u)
+ {
+ vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ vec2 s;
+ if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
+ {
+ s.x = s0.x;
+ }
+ else
+ {
+ s.x = s0.y;
+ }
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+
+ vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
+ {
+ s.y = s1.x;
+ }
+ else
+ {
+ s.y = s1.y;
+ }
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+#ifdef HAS_BIAS
+ vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
+ STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x));
+#endif /* HAS_BIAS */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
+ }
+ }
+ else
+ {
+ for(uint i = 0u; i < uint(total_filters); i = i + 2u)
+ {
+ vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ vec2 s;
+ if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
+ {
+ s.x = s0.x;
+ }
+ else
+ {
+ s.x = s0.y;
+ }
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+
+ vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
+ {
+ s.y = s1.x;
+ }
+ else
+ {
+ s.y = s1.y;
+ }
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
+ }
+ }
+}
+
+#endif /* DATA_TYPE_FP16 */
+#endif // RESHAPE_TO_COLUMNS
+
+#ifdef IM2COL_GENERIC
+
+/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx"
+ * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx"
+ * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
+ * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
+ * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
+ * @note If biases are to be added to the convolution, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ ImageAttributes dst_attrs;
+ uint src_stride_w;
+ uint dst_stride_w;
+};
+
+#ifdef DATA_TYPE_FP32
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
+
+void main(void)
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor
+ uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor
+ uint ch = gl_GlobalInvocationID.z % KERNEL_DEPTH; // input feature map
+ uint batch = gl_GlobalInvocationID.z / KERNEL_DEPTH; // the batch
+
+    // Calculate input indices
+ uint xi = xc * uint(STRIDE_X) - uint(PAD_LEFT);
+ uint yi = yc * uint(STRIDE_Y) - uint(PAD_TOP);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * src_attrs.stride_z) + (batch * src_stride_w));
+
+    // Calculate output indices
+ uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
+ uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * dst_attrs.stride_y) + (batch * dst_stride_w) + xo);
+
+ uint src_pos = 0u;
+
+ // Linearize convolution elements
+ for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+ {
+ for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u))
+ {
+#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
+ src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
+#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
+ if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
+ {
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f);
+ }
+ else
+ {
+ src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
+ }
+#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
+ }
+ }
+
+#ifdef HAS_BIAS
+ if(ch == (uint(KERNEL_DEPTH) - 1))
+ {
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
+ }
+#endif /* HAS_BIAS */
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
+#ifdef KERNEL_1x1
+
+void main(void)
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ uint xc = gl_GlobalInvocationID.x;
+ uint yc = gl_GlobalInvocationID.y;
+ uint zc = gl_GlobalInvocationID.z;
+ uint ch = zc % uint(KERNEL_DEPTH); // input feature map
+ uint batch = zc / uint(KERNEL_DEPTH); // the batch
+
+    // Calculate input indices
+ uint xi = xc;
+ uint yi = yc;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z);
+
+    // Calculate output indices
+ uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
+ uint xo = ch * dst_element_count;
+ uint yo = xc + yc * uint(CONVOLVED_WIDTH);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);
+
+ bool x_start_even = ((xc % 2u) == 0u);
+ bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u);
+ uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y);
+ uint tmp_left = 0u;
+ uint tmp_right = 0u;
+
+ if(ch % 2u != 0u)
+ {
+ return;
+ }
+
+ if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1))))
+ {
+ tmp_left = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z);
+ tmp_right = LOAD(src_ptr, input_pos);
+ if(x_start_even)
+ {
+ tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u);
+ }
+ else
+ {
+ tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u);
+ }
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
+
+#ifdef HAS_BIAS
+ if(ch == (uint(KERNEL_DEPTH) - 2u))
+ {
+ mediump vec2 bias_vec = vec2(1.f, 0.f);
+ uint bias_u = packHalf2x16(bias_vec);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u);
+ }
+#endif /* HAS_BIAS */
+ }
+ else
+ {
+ tmp_left = LOAD(src_ptr, input_pos);
+ if(x_start_even)
+ {
+ tmp_right = (tmp_left & 0xffffu);
+ }
+ else
+ {
+ tmp_right = (tmp_left >> 16u);
+ }
+
+#ifdef HAS_BIAS
+ mediump vec2 bias_vec = vec2(0.f, 1.f);
+ uint bias_u = packHalf2x16(bias_vec);
+ tmp_right += (bias_u & 0xffff0000u);
+#endif /* HAS_BIAS */
+
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ }
+}
+
+#else /* KERNEL_1x1 */
+
+void main(void)
+{
+ uint xc = gl_GlobalInvocationID.x;
+ uint yc = gl_GlobalInvocationID.y;
+ uint zc = gl_GlobalInvocationID.z;
+ uint ch = zc % uint(KERNEL_DEPTH); // input feature map
+ uint batch = zc / uint(KERNEL_DEPTH); // the batch
+
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+    // Calculate input indices
+ uint src_element_count = src_attrs.step_x / src_attrs.stride_x;
+ uint xi = (xc * uint(STRIDE_X)) / src_element_count;
+ uint yi = yc * uint(STRIDE_Y);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z);
+
+    // Calculate output indices
+ uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
+ uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count;
+ uint yo = xc + yc * uint(CONVOLVED_WIDTH);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);
+
+ bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u);
+ bool z_start_even = ((ch % 2u) == 0u);
+ uint input_pos = 0u;
+ uint tmp = 0u;
+ uint tmp_left = 0u;
+ uint tmp_right = 0u;
+
+ // Linearize convolution elements
+ for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+ {
+ uint xstart = 0u;
+ uint xend = 0u;
+
+ // even col, even row
+ if(x_start_even)
+ {
+ if(((y - yi + ch) % 2u) == 0u)
+ {
+ for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
+ }
+ }
+ else
+ {
+ // 1st pair
+ if(!z_start_even && (y == yi))
+ {
+ // cross 2d feature map
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w +
+ (ch - 1u) * src_attrs.stride_z);
+ }
+ else
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter,
+ (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
+ }
+ tmp_right = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_left = LOAD(src_ptr, input_pos);
+ tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
+
+ // remaining
+ for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_left = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_right = LOAD(src_ptr, input_pos);
+ tmp_right = (tmp_left >> 16u) + (tmp_right << 16u);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ }
+ }
+ }
+ else
+ {
+ if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even))
+ {
+ // 1st pair
+ if(y == yi)
+ {
+ // cross 2d feature map
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w +
+ (ch - 1u) * src_attrs.stride_z);
+ }
+ else
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter,
+ (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
+ }
+
+ tmp_right = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_left = LOAD(src_ptr, input_pos);
+ tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
+
+ // remaining
+ for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
+ }
+ }
+ else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even))
+ {
+ // 1st pair
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_right = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_left = LOAD(src_ptr, input_pos);
+ tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
+
+ // remaining
+ for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
+ {
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_right = LOAD(src_ptr, input_pos);
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp_left = LOAD(src_ptr, input_pos);
+ tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
+ }
+ }
+ }
+ }
+
+    // NOTE: the last element must be handled manually, outside the loops above,
+    // to avoid a write conflict across the 2D feature-map boundary
+ if(ch == uint(KERNEL_DEPTH) - 1u)
+ {
+ uint x = xi + (uint(KERNEL_WIDTH) / 2u);
+ uint y = yi + uint(KERNEL_HEIGHT) - 1u;
+ input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
+ tmp = LOAD(src_ptr, input_pos);
+ if(!x_start_even)
+ {
+ tmp = (tmp >> 16u) + (tmp << 16u);
+ }
+
+#ifdef HAS_BIAS
+ mediump vec2 bias_vec = vec2(1.f, 1.f);
+ uint bias_u = packHalf2x16(bias_vec);
+ if(z_start_even)
+ {
+ tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u);
+ }
+ else
+ {
+ tmp = (bias_u & 0xffffu);
+ }
+#endif /* HAS_BIAS */
+
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
+ }
+}
+
+#endif /* KERNEL_1x1 */
+#else /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
+#endif /* IM2COL_GENERIC */
#ifdef IM2COL_REDUCED
-#if defined(IM2COL_REDUCED_GENERIC)
+
/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
* @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
*/
+
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ VectorAttributes dst_attrs;
+ uint width;
+ uint height;
+};
+
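+// Illustrative build-time configuration (the defines are chosen by the host-side kernel
+// at configure time; the names below are the ones tested in this file):
+//   #define DATA_TYPE_FP32           // or DATA_TYPE_FP16
+//   #define IM2COL_REDUCED_GENERIC   // FP16 only: generic path; otherwise IM2COL_REDUCED_4X/8X pick the vector width
+//   #define HAS_BIAS                 // append the bias 1 at the end of the reshaped output
+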
+#ifdef DATA_TYPE_FP32
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
+
void main(void)
{
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- uvec3 size = uvec3(gl_WorkGroupSize.xyz);
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D src_nostep = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(src);
- Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
- uint image_size = width * height;
- uint element_count = src_step_x / src_stride_x;
- uint tmp_out_offset = dst.current_offset + ((pos.x * element_count + pos.y * width + pos.z * image_size) * dst.stride_x);
- uint width_fp16 = ((width + uint(1)) >> uint(1));
- uint tmp;
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+ uvec3 size = uvec3(gl_WorkGroupSize.xyz);
+ uint image_size = width * height;
+ uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x + pos.y * width + pos.z * image_size);
+
+ STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter));
+
+#ifdef HAS_BIAS
+ // If it is the last thread in the 3 dimensional workgroup
+ if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+ {
+ tmp_out_offset += (dst_attrs.stride_x >> uint(2));
+ STORE(dst_ptr, tmp_out_offset, 1.f);
+ }
+#endif // HAS_BIAS
+}
+
+#elif defined(DATA_TYPE_FP16)
+
+#if defined(IM2COL_REDUCED_8X)
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, restrict);
+#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, restrict);
+#else /* IM2COL_REDUCED_8X */
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);
+#endif /* IM2COL_REDUCED_8X */
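+
+// Note: the last numeric argument of TENSOR_DECLARATION is the element-size shift
+// (log2 of the element size in bytes), as used consistently in this file: 2 for
+// uint/float, 3 for uvec2, 4 for uvec4. The 8X/4X variants below therefore move
+// 8 or 4 packed fp16 values per load/store.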
+
+#if defined(IM2COL_REDUCED_GENERIC)
+
+void main(void)
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator src_nostep_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+ uvec3 size = uvec3(gl_WorkGroupSize.xyz);
+ uint image_size = width * height;
+ uint element_count = src_attrs.step_x / src_attrs.stride_x;
+ uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * element_count + pos.y * width + pos.z * image_size);
+ uint width_fp16 = (width + uint(1)) >> uint(1);
+ uint tmp;
// odd width
if(width % uint(2) != uint(0))
@@ -104,247 +586,149 @@
// even row
if((pos.y + pos.z * height) % uint(2) == uint(0))
{
- LOAD1(tmp, src, src.current_offset >> uint(2));
- STORE1(dst, tmp_out_offset >> uint(2), tmp);
+ tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE(dst_ptr, tmp_out_offset, tmp);
}
else
{
// special op
- uint tmpleft = uint(0);
- uint tmpright = uint(0);
- LOAD1(tmpright, src, src.current_offset >> uint(2)); // right half
+ uint tmp_left = uint(0);
+ uint tmp_right = uint(0);
+ tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half
if(pos.x == uint(0))
{
- LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, int(width), int(pos.y) - 1, int(pos.z)) >> uint(2)); // left half
- tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16));
+ tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half
+ tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16));
}
else
{
- LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)) >> uint(2)); // left half
- tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16)));
+ tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)));
+ tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16)));
}
- STORE1(dst, tmp_out_offset >> uint(2), tmpright);
+ STORE(dst_ptr, tmp_out_offset, tmp_right);
}
}
else
{
- LOAD1(tmp, src, src.current_offset >> uint(2));
- STORE1(dst, tmp_out_offset >> uint(2), tmp);
- }
+ tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE(dst_ptr, tmp_out_offset, tmp);
#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
- {
- tmp_out_offset += dst.stride_x;
+ // If it is the last thread in the 3 dimensional workgroup
+ if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
+ {
+ tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
- mediump vec2 bias_vec = vec2(1.0f, 1.0f);
- uint bias_u = packHalf2x16(bias_vec);
- STORE1(dst, tmp_out_offset >> uint(2), bias_u);
- }
+ mediump vec2 bias_vec = vec2(1.0f, 1.0f);
+ STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
+ }
#endif // HAS_BIAS
+ }
}
+
#else /* IM2COL_REDUCED_GENERIC */
-/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- */
+
void main(void)
{
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
#if defined(IM2COL_REDUCED_8X)
- uint tmp_out_offset = dst.current_offset + ((pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
- uvec4 tmp;
- LOAD1(tmp, src, src.current_offset >> uint(4));
- STORE1(dst, tmp_out_offset >> uint(4), tmp);
+ uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE));
+ uvec4 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE(dst_ptr, tmp_out_offset, tmp);
#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
- uint tmp_out_offset = dst.current_offset + ((pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
- uvec2 tmp;
- LOAD1(tmp, src, src.current_offset >> uint(3));
- STORE1(dst, tmp_out_offset >> uint(3), tmp);
+ uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE));
+ uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE(dst_ptr, tmp_out_offset, tmp);
#else /* IM2COL_REDUCED_8X */
- uint tmp_out_offset = dst.current_offset + ((pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE)) * dst.stride_x);
- uint tmp;
- LOAD1(tmp, src, src.current_offset >> uint(2));
- STORE1(dst, tmp_out_offset >> uint(2), tmp);
+ uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE));
+ uint tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ STORE(dst_ptr, tmp_out_offset, tmp);
#endif /* IM2COL_REDUCED_8X */
}
-#endif /* IM2COL_REDUCED_GENERIC */
-#endif // IM2COL_REDUCED
-#elif defined(DATA_TYPE_FP32)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, restrict);
+#endif /* IM2COL_REDUCED_GENERIC */
+#else /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
+#endif /* IM2COL_REDUCED */
-#ifdef IM2COL_GENERIC
-/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] filter_depth The depth of the used filter
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-void main(void)
-{
- uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor
- uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor
- uint ch = gl_GlobalInvocationID.z % filter_depth; // input feature map
- uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch
+#ifdef WIDTH_OUTPUT
- // Calculate input indeces
- uint xi = xc * uint(STRIDE_X) - uint(PAD_X);
- uint yi = yc * uint(STRIDE_Y) - uint(PAD_Y);
- uint input_offset = (src_offset_first_element_in_bytes + (ch * src_stride_z) + (batch * src_stride_w)) >> uint(2);
-
- // Calculate output indeces
- uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT);
- uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution
- uint output_offset = (dst_offset_first_element_in_bytes + (yo * dst_stride_y) + (batch * dst_stride_w) + xo) >> uint(2);
-
- // Linearize convolution elements
- for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
- {
- for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x)
- {
-#if PAD_X == 0 && PAD_Y == 0
- output_offset = input_offset + ((x * src_stride_x + y * src_stride_y) >> uint(2));
- STORE4(dst, output_offset, LOAD4(src, input_offset));
-#else // PAD_X == 0 && PAD_Y == 0
- if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
- {
- STORE4(dst, output_offset, 0.0f);
- }
- else
- {
- output_offset = input_offset + (x * src_stride_x + y * src_stride_y) >> uint(2));
- STORE4(dst, output_offset, LOAD4(src, input_offset));
- }
-#endif // PAD_X == 0 && PAD_Y == 0
- }
- }
-
-#ifdef HAS_BIAS
- if(ch == (uint(KERNEL_DEPTH) - 1))
- {
- STORE4(dst, output_offset, 1.0f);
- }
-#endif // HAS_BIAS
-}
-#endif // IM2COL_GENERIC
-
-#ifdef IM2COL_REDUCED
-/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- */
-void main(void)
-{
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- uvec3 size = uvec3(gl_WorkGroupSize.xyz);
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP(dst);
- uint image_size = width * height;
- uint tmp_out_offset = dst.current_offset + (((pos.x + pos.y * width + pos.z * image_size) * dst.stride_x) >> 2);
-
- STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
-
-#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
- {
- tmp_out_offset += (dst.stride_x >> uint(2));
- STORE4(dst, tmp_out_offset, 1.f);
- }
-#endif // HAS_BIAS
-}
-#endif // IM2COL_REDUCED
-
-#ifdef COL2IM
/** This kernel performs a reshaping of the output of the convolution layer.
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] dst_depth The length of the destination tensor in Z dimension
+ * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension
*/
+
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ uint dst_depth;
+ uint dst_strideZ;
+};
+
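+// Index mapping used by both variants below (as implied by the offset computations):
+// the work item at (x, y, z) writes the output pixel (x, y) of channel z % dst_depth
+// in batch z / dst_depth, reading the GEMM result at row y * WIDTH_OUTPUT + x and
+// column z % dst_depth, i.e. the inverse of the im2col reshaping above.
+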
+#ifdef DATA_TYPE_FP32
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
+
void main(void)
{
- uvec2 pos = uvec2(gl_GlobalInvocationID.xy);
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- uint idx = pos.x * dst.stride_z + (pos.y / width) * dst.stride_y + (pos.y % width) * dst.stride_x;
- uint tmp_out_offset = dst.current_offset + (idx >> 2);
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * WIDTH_OUTPUT * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * (src_attrs.stride_z));
- STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset));
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter,
+ LOAD_CURRENT_ITEM(src_ptr, src_iter));
}
-#endif // COL2IM
-#else // DATA_TYPE_FP16
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);
+
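+// FP16 variant (a descriptive sketch, inferred from the lane selection below): the GEMM
+// result packs two output channels per 32-bit word, so even channels read the .x lane
+// and odd channels step back one half-word and read the .y lane. Two consecutive GEMM
+// rows (two adjacent output pixels) are combined so that each store writes one packed
+// pair of fp16 pixels.
+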
+void main(void)
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+ uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
+
+ if((pos.z % dst_depth) % 2u == 0u)
+ {
+ uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ;
+ uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
+ uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
+ vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
+ vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
+ vec2 result = vec2(tmp1.x, tmp2.x);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+ else
+ {
+ uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u;
+ uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
+ uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
+ vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
+ vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
+ vec2 result = vec2(tmp1.y, tmp2.y);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+}
+
+#else /* DATA_TYPE_FP32 */
#error Data type not supported
-#endif // DATA_TYPE_FP16
+#endif /* DATA_TYPE_FP32 */
+#endif /* WIDTH_OUTPUT */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
new file mode 100644
index 0000000..adfc126
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a depthwise convolution.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP16"
+ * @note This kernel has multiple optimized depthwise convolution options for FP16.
+ *       The depthwise convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ", e.g. "#define PROCESS_4X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n", e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution, "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_attrs   The attributes of the biases tensor
+ */
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
+#ifdef BIAS
+ VectorAttributes biases_attrs;
+#endif /* BIAS */
+};
+
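+// Illustrative compile-time configuration (names taken from the guards in this file;
+// the host side selects the actual combination):
+//   #define DATA_TYPE_FP16
+//   #define PROCESS_4X_3Y_1Z    // or PROCESS_4X_1Y_1Z
+//   #define STRIDE_X 1          // PROCESS_4X_3Y_1Z assumes stride 1; PROCESS_4X_1Y_1Z supports 1, 2 and 3
+//   #define BIAS
+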
+#if defined(DATA_TYPE_FP16)
+#if defined(PROCESS_4X_3Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uvec2, weights_ptr, weights_shift, 3, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride1(offset)
+
+vec4 convolve1x3(vec4 s[3], vec4 w)
+{
+ vec4 r;
+
+ r = s[0] * w[0] + s[1] * w[1] + s[2] * w[2];
+
+ return r;
+}
+
+vec4[3] load_unpack_swizzle_stride1(uint offset)
+{
+ vec4 s[2];
+ s = VLOAD2_UNPACK8_HALF(src_ptr, offset);
+
+ vec4 r[3];
+ r[0] = s[0];
+ r[1] = vec4(s[0].yzw, s[1].x);
+ r[2] = vec4(s[0].zw, s[1].xy);
+
+ return r;
+}
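+
+// load_unpack_swizzle_stride1() loads 8 consecutive fp16 values and returns the three
+// shifted views needed by convolve1x3() to produce 4 horizontally adjacent outputs,
+// e.g. for inputs s0..s7: r[0] = (s0,s1,s2,s3), r[1] = (s1,s2,s3,s4), r[2] = (s2,s3,s4,s5).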
+
+void main()
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+ vec4 pixels[3];
+ for(int i = 0; i < 3; i++)
+ {
+ pixels[i] = vec4(0);
+ }
+
+ uint z_index = gl_GlobalInvocationID.z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
+
+ vec4 w[3];
+ w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+ w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ w[2] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
+
+ vec4 s[3];
+ vec4 r;
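+    // Row-reuse scheme (derived from the accumulations below): five consecutive input
+    // rows produce three output rows. Input row n is convolved with weight row 0 for
+    // output row n, weight row 1 for output row n-1 and weight row 2 for output row n-2,
+    // so each input row is loaded once and reused for up to three outputs.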
+ // first line
+ s = LOAD_UNPACK_SWIZZLE(CURRENT_ITEM_OFFSET(src_iter));
+
+ r = convolve1x3(s, w[0]);
+ pixels[0] += r;
+
+ // second line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 1, 0));
+
+ r = convolve1x3(s, w[1]);
+ pixels[0] += r;
+ r = convolve1x3(s, w[0]);
+ pixels[1] += r;
+
+ // third line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 2, 0));
+
+ r = convolve1x3(s, w[2]);
+ pixels[0] += r;
+ r = convolve1x3(s, w[1]);
+ pixels[1] += r;
+ r = convolve1x3(s, w[0]);
+ pixels[2] += r;
+
+    // fourth line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 3, 0));
+
+ r = convolve1x3(s, w[2]);
+ pixels[1] += r;
+ r = convolve1x3(s, w[1]);
+ pixels[2] += r;
+
+ // fifth line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 4, 0));
+
+ r = convolve1x3(s, w[2]);
+ pixels[2] += r;
+
+#ifdef BIAS
+ vec2 vec2_b;
+ float b;
+
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+ if(z_index % uint(2) == uint(0))
+ {
+ b = vec2_b.x;
+ }
+ else
+ {
+ b = vec2_b.y;
+ }
+
+ for(int i = 0; i < 3; i++)
+ {
+ pixels[i] += vec4(b);
+ }
+#endif /* BIAS */
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
+}
+#elif defined(PROCESS_4X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uvec2, weights_ptr, weights_shift, 3, readonly);
+#ifdef BIAS
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
+#endif /* BIAS */
+
+#if STRIDE_X == 3
+#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride3(offset)
+#elif STRIDE_X == 2
+#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride2(offset)
+#elif STRIDE_X == 1 /* STRIDE_X == 1 */
+#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride1(offset)
+#else /* STRIDE_X not equals 1, 2 or 3 */
+#error STRIDE_X larger than 3 is not supported
+#endif /* STRIDE_X == 3 */
+
+vec4 convolve1x3(vec4 s[3], vec4 w)
+{
+ vec4 r;
+
+ r = s[0] * w[0] + s[1] * w[1] + s[2] * w[2];
+
+ return r;
+}
+
+vec4[3] load_unpack_swizzle_stride1(uint offset)
+{
+ vec4 s[2];
+ s = VLOAD2_UNPACK8_HALF(src_ptr, offset);
+
+ vec4 r[3];
+ r[0] = s[0];
+ r[1] = vec4(s[0].yzw, s[1].x);
+ r[2] = vec4(s[0].zw, s[1].xy);
+
+ return r;
+}
+
+vec4[3] load_unpack_swizzle_stride2(uint offset)
+{
+ vec4 s[3];
+ s[0] = LOAD_UNPACK4_HALF(src_ptr, offset);
+ s[1] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(1));
+ s[2] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(2));
+
+ vec4 r[3];
+ r[0] = vec4(s[0].xz, s[1].xz);
+ r[1] = vec4(s[0].yw, s[1].yw);
+ r[2] = vec4(s[0].z, s[1].xz, s[2].x);
+
+ return r;
+}
+
+vec4[3] load_unpack_swizzle_stride3(uint offset)
+{
+ vec4 s[3];
+ s[0] = LOAD_UNPACK4_HALF(src_ptr, offset);
+ s[1] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(1));
+ s[2] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(2));
+
+ vec4 r[3];
+ r[0] = vec4(s[0].xw, s[1].z, s[2].y);
+ r[1] = vec4(s[0].y, s[1].xw, s[2].z);
+ r[2] = vec4(s[0].z, s[1].y, s[2].xw);
+
+ return r;
+}
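+
+// For strides 2 and 3 the swizzles above gather every 2nd/3rd unpacked value from three
+// packed loads, e.g. with STRIDE_X == 2 and inputs s0..s11:
+//   left = (s0,s2,s4,s6), middle = (s1,s3,s5,s7), right = (s2,s4,s6,s8).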
+
+void main()
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+#ifdef BIAS
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+ vec4 pixels = vec4(0.f);
+
+ uint z_index = gl_GlobalInvocationID.z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
+
+ vec4 w[3];
+ w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
+ w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ w[2] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
+
+ vec4 s[3];
+ vec4 r;
+ // first line
+ s = LOAD_UNPACK_SWIZZLE(CURRENT_ITEM_OFFSET(src_iter));
+
+ r = convolve1x3(s, w[0]);
+ pixels += r;
+
+ // second line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 1, 0));
+
+ r = convolve1x3(s, w[1]);
+ pixels += r;
+
+ // third line
+ s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 2, 0));
+
+ r = convolve1x3(s, w[2]);
+ pixels += r;
+
+#ifdef BIAS
+ vec2 vec2_b;
+ float b;
+
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+
+ if(z_index % uint(2) == uint(0))
+ {
+ b = vec2_b.x;
+ }
+ else
+ {
+ b = vec2_b.y;
+ }
+
+ pixels += vec4(b);
+#endif /* BIAS */
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
+}
+#endif /* PROCESS_4X_3Y_1Z */
+#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index 190d7d6..ea4e9c1 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -1005,6 +1005,6 @@
STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
}
#endif /* PROCESS_4X_1Y_1Z */
-#else /* DATA_TYPE_F32 */
+#else /* DATA_TYPE_FP32 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index d450ac1..855d450 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -23,132 +23,107 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note This kernel has multiple optimized direct convolution options for FP16.
+ * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * This OpenGL ES shader works with stride_x = 1 and 2
+ * @note In case biases will be added to the convolution, "#define BIAS" has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in]  biases_attrs       The attributes of the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth      The third dimension of the weights tensor
+ */
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
+ VectorAttributes biases_attrs;
#endif /* BIAS */
uint weights_stride_w;
uint weights_depth;
};
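+
+// Illustrative compile-time configuration (names taken from the guards in this file;
+// the host side picks the actual combination):
+//   #define DATA_TYPE_FP32      // or DATA_TYPE_FP16
+//   #define PROCESS_4X_1Y_1Z    // or PROCESS_1X_1Y_1Z, PROCESS_8X_1Y_1Z, PROCESS_4X_3Y_1Z
+//   #define STRIDE_X 1          // this shader works with stride_x = 1 and 2
+//   #define BIAS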
-#define LOAD12(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1)); \
- r.z = LOAD4(name, offset + uint(2))
-
-#define LOAD3X3(r, name) \
- r[0] = LOAD4(name, tensor3D_offset(name, 0, 0, 0)); \
- r[1] = LOAD4(name, tensor3D_offset(name, 1, 0, 0)); \
- r[2] = LOAD4(name, tensor3D_offset(name, 2, 0, 0)); \
- r[3] = LOAD4(name, tensor3D_offset(name, 0, 1, 0)); \
- r[4] = LOAD4(name, tensor3D_offset(name, 1, 1, 0)); \
- r[5] = LOAD4(name, tensor3D_offset(name, 2, 1, 0)); \
- r[6] = LOAD4(name, tensor3D_offset(name, 0, 2, 0)); \
- r[7] = LOAD4(name, tensor3D_offset(name, 1, 2, 0)); \
- r[8] = LOAD4(name, tensor3D_offset(name, 2, 2, 0))
-
-#if defined(PROCESS_1_ELEMENT)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+#if defined(DATA_TYPE_FP32)
+#if defined(PROCESS_1X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- float pixels = CONVERT(0, float);
+ float pixels = 0.f;
uint z_index = gl_GlobalInvocationID.z;
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
vec3 temp;
vec3 w;
- LOAD12(temp, src, offset(src, 0, 0));
- LOAD12(w, weights, tensor3D_offset(weights, 0, 0, 0));
+ temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
- LOAD12(temp, src, offset(src, 0, 1));
- LOAD12(w, weights, tensor3D_offset(weights, 0, 1, 0));
+ temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
- LOAD12(temp, src, offset(src, 0, 2));
- LOAD12(w, weights, tensor3D_offset(weights, 0, 2, 0));
+ temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
- src.current_offset += src_stride_z >> 2;
- weights.current_offset += weights_stride_z >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+ pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
- STORE4(dst, CURRENT_OFFSET(dst), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
-#elif defined(PROCESS_8_ELEMENT)
-BUFFER_DECLARATION(src, 1, vec4, readonly);
-BUFFER_DECLARATION(dst, 2, vec4, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+
+#elif defined(PROCESS_8X_1Y_1Z)
+
+TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#if STRIDE_X == 2
@@ -166,7 +141,7 @@
vec4 tmp[3];
vec4 r[2];
- LOAD3(tmp, src, offset);
+ tmp = VLOAD3(vec4[3], src_ptr, offset);
middle = vec4(tmp[0].yzw, tmp[1].x);
right = vec4(tmp[0].zw, tmp[1].xy);
@@ -186,73 +161,37 @@
vec4 left;
vec4 middle;
vec4 right;
- vec4 tmp[3];
+ vec4 tmp1[3];
+ vec4 tmp2[2];
vec4 r[2];
- LOAD3(tmp, src, offset);
+ tmp1 = VLOAD3(vec4[3], src_ptr, offset);
- left = vec4(tmp[0].xz, tmp[1].xz);
- middle = vec4(tmp[0].yw, tmp[1].yw);
- right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
+ left = vec4(tmp1[0].xz, tmp1[1].xz);
+ middle = vec4(tmp1[0].yw, tmp1[1].yw);
+ right = vec4(tmp1[0].z, tmp1[1].xz, tmp1[2].x);
r[0] = left * w[0] + middle * w[1] + right * w[2];
- LOAD2(tmp, src, offset + ((uint(3) * src_stride_x) >> 2));
+ tmp2 = VLOAD2(vec4[2], src_ptr, offset + uint(3));
- left = vec4(tmp[2].xz, tmp[0].xz);
- middle = vec4(tmp[2].yw, tmp[0].yw);
- right = vec4(tmp[2].z, tmp[0].xz, tmp[1].x);
+ left = vec4(tmp1[2].xz, tmp2[0].xz);
+ middle = vec4(tmp1[2].yw, tmp2[0].yw);
+ right = vec4(tmp1[2].z, tmp2[0].xz, tmp2[1].x);
r[1] = left * w[0] + middle * w[1] + right * w[2];
return r;
}
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
vec4 pixels[2];
@@ -260,8 +199,7 @@
pixels[1] = vec4(0);
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
@@ -270,45 +208,46 @@
vec4 r[2];
// first line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- r = CONVOLVE1x3(src.current_offset >> uint(2), w);
+ r = CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), w);
pixels[0] += r[0];
pixels[1] += r[1];
// second line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+ r = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), w);
pixels[0] += r[0];
pixels[1] += r[1];
// third line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
- r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
+ r = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), w);
pixels[0] += r[0];
pixels[1] += r[1];
- src.current_offset += src_stride_z >> 2;
- weights.current_offset += weights_stride_z >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- float b;
- LOAD1(b, biases, vector_offset(biases, int(z_index)));
+ float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
pixels[0] += vec4(b);
pixels[1] += vec4(b);
#endif /* BIAS */
- STORE2(dst, dst.current_offset >> uint(2), pixels);
+ VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
-#elif defined(PROCESS_4_ELEMENT)
-BUFFER_DECLARATION(src, 1, vec4, readonly);
-BUFFER_DECLARATION(dst, 2, vec4, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+
+#elif defined(PROCESS_4X_1Y_1Z)
+
+TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#if STRIDE_X == 2
@@ -325,7 +264,7 @@
vec4 middle;
vec4 right;
- LOAD2(tmp, src, offset);
+ tmp = VLOAD2(vec4[2], src_ptr, offset);
middle = vec4(tmp[0].yzw, tmp[1].x);
right = vec4(tmp[0].zw, tmp[1].xy);
@@ -343,7 +282,7 @@
vec4 tmp[3];
- LOAD3(tmp, src, offset);
+ tmp = VLOAD3(vec4[3], src_ptr, offset);
left = vec4(tmp[0].xz, tmp[1].xz);
middle = vec4(tmp[0].yw, tmp[1].yw);
@@ -354,59 +293,21 @@
return tmp[0];
}
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
vec4 pixels;
- pixels = vec4(0);
+ pixels = vec4(0.f);
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
@@ -414,38 +315,36 @@
vec3 w;
// first line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0));
-
- pixels += CONVOLVE1x3(src.current_offset >> uint(2), w);
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
+ pixels += CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), w);
// second line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0));
-
- pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w);
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ pixels += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), w);
// third line
- LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0));
+ w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
+ pixels += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), w);
- pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w);
-
- src.current_offset += src_stride_z >> 2;
- weights.current_offset += weights_stride_z >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- float b;
- LOAD1(b, biases, vector_offset(biases, int(z_index)));
- pixels += vec4(b);
+ float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+ pixels += b;
#endif /* BIAS */
- STORE1(dst, dst.current_offset >> uint(2), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS)
-BUFFER_DECLARATION(src, 1, vec4, readonly);
-BUFFER_DECLARATION(dst, 2, vec4, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+
+#elif defined(PROCESS_4X_3Y_1Z)
+
+TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
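// Editorial note, illustrative only (not part of the patch): in the
// TENSOR_DECLARATION() lines above, the numeric argument that follows the
// *_shift name appears to track log2 of the SSBO element size, so byte offsets
// can be turned into element indices with a right shift:
//   float / uint  ->  4 bytes -> shift 2
//   uvec2         ->  8 bytes -> shift 3
//   vec4 / uvec4  -> 16 bytes -> shift 4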
#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
@@ -459,51 +358,14 @@
return r;
}
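// Editorial sketch, not part of the patch: the body of convolve1x3_stride1()
// is elided by this hunk. For a 3-tap row convolution it is assumed to reduce
// to a per-component multiply-add over the three shifted views, along the
// lines of the hypothetical reference below.
//
// vec4 convolve1x3_stride1_reference(vec4 left, vec4 middle, vec4 right, vec3 w)
// {
//     return left * w.x + middle * w.y + right * w.z;
// }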
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
vec4 pixels[3];
@@ -512,36 +374,35 @@
pixels[2] = vec4(0);
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
// load 3 weights once
vec3 w[3];
- LOAD3(w[0], weights, tensor3D_offset(weights, 0, 0, 0));
- LOAD3(w[1], weights, tensor3D_offset(weights, 0, 1, 0));
- LOAD3(w[2], weights, tensor3D_offset(weights, 0, 2, 0));
+ w[0] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
+ w[1] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ w[2] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec4 s[2];
vec4 middle;
vec4 right;
// first line
- LOAD2(s, src, src.current_offset >> uint(2));
+ s = VLOAD2_CURRENT_ITEM(vec4[2], src_ptr, src_iter);
middle = vec4(s[0].yzw, s[1].x);
right = vec4(s[0].zw, s[1].xy);
pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]);
// second line
- LOAD2(s, src, (src.current_offset + (src_stride_y >> 2)) >> uint(2));
+ s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
middle = vec4(s[0].yzw, s[1].x);
right = vec4(s[0].zw, s[1].xy);
pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]);
pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]);
// third line
- LOAD2(s, src, (src.current_offset + (src_stride_y >> 1)) >> uint(2));
+ s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
middle = vec4(s[0].yzw, s[1].x);
right = vec4(s[0].zw, s[1].xy);
pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]);
@@ -549,43 +410,45 @@
pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]);
// fourth line
- LOAD2(s, src, (src.current_offset + (uint(3) * (src_stride_y >> 2))) >> uint(2));
+ s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
middle = vec4(s[0].yzw, s[1].x);
right = vec4(s[0].zw, s[1].xy);
pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]);
pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]);
// fifth line
- LOAD2(s, src, (src.current_offset + (src_stride_y)) >> uint(2));
+ s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
middle = vec4(s[0].yzw, s[1].x);
right = vec4(s[0].zw, s[1].xy);
pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]);
- src.current_offset += src_stride_z >> 2;
- weights.current_offset += weights_stride_z >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- float b;
- LOAD1(b, biases, vector_offset(biases, int(z_index)));
+ float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
pixels[0] += vec4(b);
pixels[1] += vec4(b);
pixels[2] += vec4(b);
#endif /* BIAS */
- STORE1(dst, dst.current_offset >> uint(2), pixels[0]);
- STORE1(dst, (dst.current_offset + (dst_stride_y >> 2)) >> uint(2), pixels[1]);
- STORE1(dst, (dst.current_offset + (dst_stride_y >> 1)) >> uint(2), pixels[2]);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels[0]);
+ STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
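// Editorial note, illustrative only: the middle/right swizzles above build the
// shifted windows for four horizontal outputs out of two adjacent vec4 loads.
// Assuming s[0] = {x0,x1,x2,x3} and s[1] = {x4,x5,x6,x7}:
//   left   = s[0]                   = {x0,x1,x2,x3}
//   middle = vec4(s[0].yzw, s[1].x) = {x1,x2,x3,x4}
//   right  = vec4(s[0].zw, s[1].xy) = {x2,x3,x4,x5}
// so output column i accumulates w.x*left[i] + w.y*middle[i] + w.z*right[i].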
-#elif defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#endif // PROCESS_nX_nY
+
+#elif defined(DATA_TYPE_FP16)
+
+#if defined(PROCESS_8X_3Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
@@ -609,14 +472,12 @@
return r;
}
-vec4[3] load_and_unpack(uint offset)
+vec4[3] vload2_src_unpack12_half(uint offset)
{
uvec4 packed_s[2];
vec4 s[3];
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
- ;
+ packed_s = VLOAD2(uvec4[2], src_ptr, offset);
s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
@@ -625,56 +486,16 @@
return s;
}
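// Editorial note, illustrative only: each uint packs two FP16 values, so the
// single VLOAD2 of uvec4[2] above fetches 16 halves; unpackHalf2x16() expands
// them and the first twelve (three vec4s) are returned, covering the
// 8 + 2 = 10 input columns needed for eight 3-tap outputs at stride 1.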
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8x3 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- uvec2 packed_d[2];
- uvec4 vd;
-
vec4 pixels[3][2];
int i, j;
for(i = 0; i < 3; i++)
@@ -686,17 +507,16 @@
}
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
// load 3 weights once
uvec2 packed_w[3];
- LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
- LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
- LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+ packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
+ packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec3 w[3];
w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
@@ -706,18 +526,16 @@
uvec4 packed_s[2];
vec4 s[3];
vec4 r[2];
- uint offset;
+
// first line
- offset = src.current_offset >> uint(4);
- s = load_and_unpack(offset);
+ s = vload2_src_unpack12_half(CURRENT_ITEM_OFFSET(src_iter));
r = CONVOLVE1x3(s, w[0]);
pixels[0][0] += r[0];
pixels[0][1] += r[1];
// second line
- offset = (src.current_offset + src_stride_y) >> uint(4);
- s = load_and_unpack(offset);
+ s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 1));
r = CONVOLVE1x3(s, w[1]);
pixels[0][0] += r[0];
@@ -727,8 +545,7 @@
pixels[1][1] += r[1];
// third line
- offset = (src.current_offset + (src_stride_y << 1)) >> uint(4);
- s = load_and_unpack(offset);
+ s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 2));
r = CONVOLVE1x3(s, w[2]);
pixels[0][0] += r[0];
@@ -741,8 +558,7 @@
pixels[2][1] += r[1];
// fourth line
- offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(4);
- s = load_and_unpack(offset);
+ s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 3));
r = CONVOLVE1x3(s, w[2]);
pixels[1][0] += r[0];
@@ -752,29 +568,28 @@
pixels[2][1] += r[1];
// fifth line
- offset = (src.current_offset + (src_stride_y << 2)) >> uint(4);
- s = load_and_unpack(offset);
+ s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 4));
r = CONVOLVE1x3(s, w[2]);
pixels[2][0] += r[0];
pixels[2][1] += r[1];
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
for(i = 0; i < 3; i++)
@@ -786,37 +601,25 @@
}
#endif /* BIAS */
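// Editorial note, illustrative only: FP16 biases are stored packed, two per
// 32-bit word, so the vec2 returned by LOAD_UNPACK2_HALF() holds the biases of
// an even/odd pair of output maps; the parity test on z_index above is thus
// equivalent to
//   float b = (z_index % 2u == 0u) ? vec2_b.x : vec2_b.y;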
- packed_d[0] = uvec2(packHalf2x16(pixels[0][0].xy), packHalf2x16(pixels[0][0].zw));
- packed_d[1] = uvec2(packHalf2x16(pixels[0][1].xy), packHalf2x16(pixels[0][1].zw));
- vd = uvec4(packed_d[0], packed_d[1]);
- STORE1(dst, dst.current_offset >> uint(4), vd);
-
- packed_d[0] = uvec2(packHalf2x16(pixels[1][0].xy), packHalf2x16(pixels[1][0].zw));
- packed_d[1] = uvec2(packHalf2x16(pixels[1][1].xy), packHalf2x16(pixels[1][1].zw));
- vd = uvec4(packed_d[0], packed_d[1]);
- STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(4), vd);
-
- packed_d[0] = uvec2(packHalf2x16(pixels[2][0].xy), packHalf2x16(pixels[2][0].zw));
- packed_d[1] = uvec2(packHalf2x16(pixels[2][1].xy), packHalf2x16(pixels[2][1].zw));
- vd = uvec4(packed_d[0], packed_d[1]);
- STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(4), vd);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+ STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
-#elif defined(PROCESS_X_4ELEMENTS_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#elif defined(PROCESS_4X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#if STRIDE_X == 2
#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
-#define LOAD_AND_UNPACK(offset) load_and_unpack_stride2(offset)
+#define LOAD_AND_UNPACK(offset) VLOAD3_UNPACK12_HALF(src_ptr, offset)
#elif STRIDE_X == 1 /* STRIDE_X == 1 */
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-#define LOAD_AND_UNPACK(offset) load_and_unpack_stride1(offset)
+#define LOAD_AND_UNPACK(offset) VLOAD2_UNPACK8_HALF(src_ptr, offset)
#else /* STRIDE_X not equals 1 or 2 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
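// Editorial note, illustrative only: with four outputs per work item a 3-tap
// window needs 4 + 2 = 6 input columns at stride 1 but (4 - 1) * 2 + 3 = 9 at
// stride 2, which is why the stride-1 path above unpacks 8 halves (VLOAD2 of
// uvec2) while the stride-2 path unpacks 12 (VLOAD3 of uvec2).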
@@ -851,81 +654,14 @@
return r;
}
-vec4[2] load_and_unpack_stride1(uint offset)
-{
- uvec2 packed_s[2];
- vec4 s[2];
-
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
-
- return s;
-}
-
-vec4[3] load_and_unpack_stride2(uint offset)
-{
- uvec2 packed_s[3];
- vec4 s[3];
-
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
- LOAD1(packed_s[2], src, offset + uint(2));
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
- s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
-
- return s;
-}
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
uvec2 packed_d;
@@ -933,17 +669,16 @@
vec4 pixels = vec4(0);
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
// load 3 weights once
uvec2 packed_w[3];
- LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
- LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
- LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+ packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
+ packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec3 w[3];
w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
@@ -958,57 +693,50 @@
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
vec4 r;
- uint offset;
- // first line
- offset = src.current_offset >> uint(3);
- s = LOAD_AND_UNPACK(offset);
+ // first line
+ s = LOAD_AND_UNPACK(CURRENT_ITEM_OFFSET(src_iter));
pixels += CONVOLVE1x3(s, w[0]);
// second line
- offset = (src.current_offset + src_stride_y) >> uint(3);
- s = LOAD_AND_UNPACK(offset);
-
+ s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 1));
pixels += CONVOLVE1x3(s, w[1]);
// third line
- offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
- s = LOAD_AND_UNPACK(offset);
-
+ s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 2));
pixels += CONVOLVE1x3(s, w[2]);
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
pixels += vec4(b);
#endif /* BIAS */
- packed_d = uvec2(packHalf2x16(pixels.xy), packHalf2x16(pixels.zw));
- STORE1(dst, dst.current_offset >> uint(3), packed_d);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#elif defined(PROCESS_4X_3Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
@@ -1027,69 +755,16 @@
return r;
}
-vec4[2] load_and_unpack(uint offset)
-{
- uvec2 packed_s[2];
- vec4 s[2];
-
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
-
- return s;
-}
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- uvec2 packed_d;
-
vec4 pixels[3];
int i;
@@ -1099,17 +774,16 @@
}
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
// load 3 weights once
uvec2 packed_w[3];
- LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
- LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
- LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+ packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
+ packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec3 w[3];
w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
@@ -1118,57 +792,47 @@
vec4 s[2];
vec4 r;
- uint offset;
- // first line
- offset = src.current_offset >> uint(3);
- s = load_and_unpack(offset);
+ // first line
+ s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
pixels[0] += CONVOLVE1x3(s, w[0]);
// second line
- offset = (src.current_offset + src_stride_y) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
pixels[0] += CONVOLVE1x3(s, w[1]);
pixels[1] += CONVOLVE1x3(s, w[0]);
// third line
- offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
pixels[0] += CONVOLVE1x3(s, w[2]);
pixels[1] += CONVOLVE1x3(s, w[1]);
pixels[2] += CONVOLVE1x3(s, w[0]);
// fourth line
- offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
pixels[1] += CONVOLVE1x3(s, w[2]);
pixels[2] += CONVOLVE1x3(s, w[1]);
// fifth line
- offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
pixels[2] += CONVOLVE1x3(s, w[2]);
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
for(i = 0; i < 3; i++)
@@ -1177,23 +841,17 @@
}
#endif /* BIAS */
- packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
- STORE1(dst, dst.current_offset >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
- STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
- STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
}
-#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#elif defined(PROCESS_4X_4Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
@@ -1212,69 +870,16 @@
return r;
}
-vec4[2] load_and_unpack(uint offset)
-{
- uvec2 packed_s[2];
- vec4 s[2];
-
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
-
- return s;
-}
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x4 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- uvec2 packed_d;
-
vec4 pixels[4];
int i;
@@ -1284,17 +889,16 @@
}
uint z_index = gl_GlobalInvocationID.z;
-
- weights.current_offset += z_index * weights_stride_w;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
// load 3 weights once
uvec2 packed_w[3];
- LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
- LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
- LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+ packed_w[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
+ packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec3 w[3];
w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
@@ -1303,65 +907,53 @@
vec4 s[2];
vec4 r;
- uint offset;
- // first line
- offset = src.current_offset >> uint(3);
- s = load_and_unpack(offset);
+ // first line
+ s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
pixels[0] += CONVOLVE1x3(s, w[0]);
// second line
- offset = (src.current_offset + src_stride_y) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
pixels[0] += CONVOLVE1x3(s, w[1]);
pixels[1] += CONVOLVE1x3(s, w[0]);
// third line
- offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
pixels[0] += CONVOLVE1x3(s, w[2]);
pixels[1] += CONVOLVE1x3(s, w[1]);
pixels[2] += CONVOLVE1x3(s, w[0]);
// fourth line
- offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
pixels[1] += CONVOLVE1x3(s, w[2]);
pixels[2] += CONVOLVE1x3(s, w[1]);
pixels[3] += CONVOLVE1x3(s, w[0]);
// fifth line
- offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
pixels[2] += CONVOLVE1x3(s, w[2]);
pixels[3] += CONVOLVE1x3(s, w[1]);
// sixth line
- offset = (src.current_offset + uint(5) * (src_stride_y)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 5));
pixels[3] += CONVOLVE1x3(s, w[2]);
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
for(i = 0; i < 4; i++)
@@ -1370,26 +962,17 @@
}
#endif /* BIAS */
- packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
- STORE1(dst, dst.current_offset >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
- STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
- STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[3].xy), packHalf2x16(pixels[3].zw));
- STORE1(dst, (dst.current_offset + uint(3) * (dst_stride_y)) >> uint(3), packed_d);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels[3]);
}
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
-precision mediump float;
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#elif defined(PROCESS_4X_3Y_2Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
@@ -1408,85 +991,31 @@
return r;
}
-vec4[2] load_and_unpack(uint offset)
-{
- uvec2 packed_s[2];
- vec4 s[2];
-
- LOAD1(packed_s[0], src, offset);
- LOAD1(packed_s[1], src, offset + uint(1));
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
-
- return s;
-}
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3x2 elements at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- uvec2 packed_d;
-
vec4 pixels[3];
int i;
uint z_base_index = gl_GlobalInvocationID.z << 1;
// store original src current offset
- uint s_offset = src.current_offset;
+ uint s_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(src_iter);
- weights.current_offset += z_base_index * weights_stride_w;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);
for(int z = 0; z < 2; ++z)
{
uint z_index = z_base_index + uint(z);
- src.current_offset = s_offset;
- //weights.current_offset = z_index * weights_stride_w;
+ SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, s_offset_in_bytes);
for(i = 0; i < 3; i++)
{
@@ -1498,9 +1027,9 @@
// load 3 weights once
uvec2 packed_w[3];
- LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2);
- LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2);
- LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2);
+ packed_w[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
+ packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
+ packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
vec3 w[3];
w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
@@ -1509,57 +1038,47 @@
vec4 s[2];
vec4 r;
- uint offset;
- // first line
- offset = src.current_offset >> uint(3);
- s = load_and_unpack(offset);
+ // first line
+ s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
pixels[0] += CONVOLVE1x3(s, w[0]);
// second line
- offset = (src.current_offset + src_stride_y) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
pixels[0] += CONVOLVE1x3(s, w[1]);
pixels[1] += CONVOLVE1x3(s, w[0]);
// third line
- offset = (src.current_offset + (src_stride_y << 1)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
pixels[0] += CONVOLVE1x3(s, w[2]);
pixels[1] += CONVOLVE1x3(s, w[1]);
pixels[2] += CONVOLVE1x3(s, w[0]);
// fourth line
- offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
pixels[1] += CONVOLVE1x3(s, w[2]);
pixels[2] += CONVOLVE1x3(s, w[1]);
// fifth line
- offset = (src.current_offset + (src_stride_y << 2)) >> uint(3);
- s = load_and_unpack(offset);
-
+ s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
pixels[2] += CONVOLVE1x3(s, w[2]);
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
for(i = 0; i < 3; i++)
@@ -1568,16 +1087,16 @@
}
#endif /* BIAS */
- packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw));
- STORE1(dst, dst.current_offset >> uint(3), packed_d);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
+ STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
- packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
- STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d);
-
- packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw));
- STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d);
-
- dst.current_offset += dst_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
}
}
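// Editorial note, illustrative only: the PROCESS_4X_3Y_2Z variant computes two
// consecutive output feature maps per invocation. The source offset saved
// before the z loop is restored for the second pass, while the weights
// iterator simply keeps advancing: assuming the 3x3xD filters are packed
// contiguously (weights_stride_w == weights_depth * stride_z), the inner loop
// leaves it pointing at the next filter.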
-#endif /* PROCESS_1_ELEMENT */
+
+#endif /* PROCESS_nX_nY_nZ */
+
+#else /* DATA_TYPE_FP32 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP32 */
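// Editorial summary, illustrative only: after this patch direct_convolution3x3.cs
// selects exactly one kernel body per compilation, roughly
//   #if defined(DATA_TYPE_FP32)
//       ...PROCESS_4X_3Y_1Z and the other FP32 variants...
//   #elif defined(DATA_TYPE_FP16)
//       ...PROCESS_8X_3Y_1Z / PROCESS_4X_1Y_1Z / PROCESS_4X_3Y_1Z /
//          PROCESS_4X_4Y_1Z / PROCESS_4X_3Y_2Z...
//   #else
//   #error Data type not supported
//   #endif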
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index a36bd43..c919e4e 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,274 +24,114 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-#ifdef DATA_TYPE_FP32
-
-precision highp float;
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
/** This kernel performs a direct convolution to convolve the low three dimensions
*
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note This kernel has multiple optimized direct convolution options for FP16.
+ * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * This OpenGL ES shader works with stride_x = 1 and 2
* @note If biases are used then "define HAS_BIAS" has to be passed at compile time
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_attrs The attributes of the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimension of the weights tensor
*/
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
+ VectorAttributes biases_attrs;
#endif /* BIAS */
uint weights_stride_w;
uint weights_depth;
};
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
-#define LOAD20(r, name, offset) \
- r[0] = LOAD4(name, offset); \
- r[1] = LOAD4(name, offset + uint(1)); \
- r[2] = LOAD4(name, offset + uint(2)); \
- r[3] = LOAD4(name, offset + uint(3)); \
- r[4] = LOAD4(name, offset + uint(4))
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- float pixels = CONVERT(0, float);
+ float pixels = 0.f;
uint z_index = gl_GlobalInvocationID.z;
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
float temp[5];
float temp_weight[5];
-
for(int d = 0; d < int(weights_depth); ++d)
{
- LOAD20(temp, src, offset(src, 0, 0));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 1));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 2));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 3));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 3, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- LOAD20(temp, src, offset(src, 0, 4));
- LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
+ temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
+ temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 4, 0));
pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
- src.current_offset += (src_stride_z >> 2);
- weights.current_offset += (weights_stride_z >> 2);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+ pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
- STORE4(dst, CURRENT_OFFSET(dst), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
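
For orientation, the FP32 work item above accumulates a 5x5 window dot product over every input channel (weights_depth) and then adds an optional per-output-map bias. A minimal CPU sketch of that accumulation follows; the dense row-major indexing and the lack of border handling are assumptions for illustration only, the real kernel walks the tensors through strides and iterators.

    // Hedged reference only: assumes dense row-major layouts, no padding.
    float direct_conv5x5_ref(const float *src, const float *weights, const float *bias,
                             int src_w, int src_h, int channels,
                             int out_x, int out_y, int out_map,
                             int stride_x, int stride_y)
    {
        float acc = 0.f;
        for(int c = 0; c < channels; ++c)     // weights_depth
        {
            for(int ky = 0; ky < 5; ++ky)     // 5 filter rows
            {
                for(int kx = 0; kx < 5; ++kx) // 1x5 row dot product
                {
                    const float s = src[(c * src_h + out_y * stride_y + ky) * src_w + out_x * stride_x + kx];
                    const float w = weights[((out_map * channels + c) * 5 + ky) * 5 + kx];
                    acc += s * w;
                }
            }
        }
        return (bias != nullptr) ? acc + bias[out_map] : acc;
    }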
-
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-#if defined(PROCESS_4X_1Y_1Z)
-
-/** This kernel performs a direct convolution to convolve the low three dimensions
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
-#endif /* BIAS */
-
+// Common definitions for DATA_TYPE_FP16
#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD3_UNPACK12_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
 #else /* STRIDE_X == 1 */
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 1 */
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
+#define LOAD_WEIGHT_AT_ROW(row) VLOAD3_UNPACK6_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, row, 0))
vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
{
@@ -317,715 +157,57 @@
return ret;
}
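
The convolve1x5_stride1 helper above computes four horizontal outputs at once by sliding a 5-wide window across eight consecutive inputs (the two unpacked vec4 values). A CPU sketch of that idea, with plain arrays standing in for the vec4 swizzles:

    #include <array>

    // Stride-1 trick: 8 consecutive inputs and 5 weights give 4 adjacent
    // outputs, each a 1x5 dot product on a window shifted by one element.
    std::array<float, 4> convolve1x5_stride1_ref(const std::array<float, 8> &in,
                                                 const std::array<float, 5> &w)
    {
        std::array<float, 4> out{};
        for(int x = 0; x < 4; ++x)
        {
            for(int k = 0; k < 5; ++k)
            {
                out[x] += in[x + k] * w[k];
            }
        }
        return out;
    }

The stride-2 variant does the same with twelve unpacked inputs and a step of two between output windows, which is why it needs the wider VLOAD3_UNPACK12_HALF load.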
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
+#if defined(PROCESS_4X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
- vec4 res = vec4(0);
- vec2 w[3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
+void main()
+{
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- weights.current_offset += z_index * weights_stride_w;
+#ifdef BIAS
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
+#endif /* BIAS */
+
+ vec4 res = vec4(0);
+ vec2 w[3];
+ vec4 s[STRIDE_X + 1];
+
+ uint z_index = gl_GlobalInvocationID.z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
for(int d = 0; d < int(weights_depth); ++d)
{
for(int row = 0; row < 5; row++)
{
- w = load_weight(weights, row);
- s = LOAD_SRC(src, row);
+ w = LOAD_WEIGHT_AT_ROW(row);
+ s = LOAD_SRC_AT_ROW(row);
res += CONVOLVE1x5(s, w);
}
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+ b = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;
res += vec4(b);
#endif /* BIAS */
- packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
}
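
A note on the bias fetch above: with FP16 data every 32-bit word of the bias buffer carries two half-precision values, so the word index works out to z_index / 2 and the parity of z_index picks the low or high lane of the unpacked pair (unpackHalf2x16 places the 16 LSBs in .x and the 16 MSBs in .y). A small host-side sketch of the same addressing, for illustration only:

    #include <cstdint>

    // Returns the raw 16-bit pattern of bias z; decoding it to float is what
    // LOAD_UNPACK2_HALF / unpackHalf2x16 do in the shader.
    uint16_t fp16_bias_bits(const uint32_t *packed_biases, unsigned int z)
    {
        const uint32_t word = packed_biases[z / 2];                   // two biases per word
        return (z % 2u == 0u) ? static_cast<uint16_t>(word & 0xFFFFu) // even z -> low lane (.x)
                              : static_cast<uint16_t>(word >> 16);    // odd z  -> high lane (.y)
    }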
-#elif defined(PROCESS_4X_3Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
- vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
- vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
- vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
- vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
- vec4 res[3];
- vec2 w[5][3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
- int i;
-
- for(i = 0; i < 3; i++)
- {
- res[i] = vec4(0);
- }
-
- weights.current_offset += z_index * weights_stride_w;
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load weights once
- for(int row = 0; row < 5; row++)
- {
- w[row] = load_weight(weights, row);
- }
-
- // 1st line
- s = LOAD_SRC(src, 0);
- res[0] += CONVOLVE1x5(s, w[0]);
-
- // 2nd line
- s = LOAD_SRC(src, 1);
- res[0] += CONVOLVE1x5(s, w[1]);
- res[1] += CONVOLVE1x5(s, w[0]);
-
- // 3rd line
- s = LOAD_SRC(src, 2);
- res[0] += CONVOLVE1x5(s, w[2]);
- res[1] += CONVOLVE1x5(s, w[1]);
- res[2] += CONVOLVE1x5(s, w[0]);
-
- // 4th line
- s = LOAD_SRC(src, 3);
- res[0] += CONVOLVE1x5(s, w[3]);
- res[1] += CONVOLVE1x5(s, w[2]);
- res[2] += CONVOLVE1x5(s, w[1]);
-
- // 5th line
- s = LOAD_SRC(src, 4);
- res[0] += CONVOLVE1x5(s, w[4]);
- res[1] += CONVOLVE1x5(s, w[3]);
- res[2] += CONVOLVE1x5(s, w[2]);
-
- // 6th line
- s = LOAD_SRC(src, 5);
- res[1] += CONVOLVE1x5(s, w[4]);
- res[2] += CONVOLVE1x5(s, w[3]);
-
- // 7th line
- s = LOAD_SRC(src, 6);
- res[2] += CONVOLVE1x5(s, w[4]);
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- for(i = 0; i < 3; i++)
- {
- res[i] += vec4(b);
- }
-#endif /* BIAS */
-
- for(i = 0; i < 3; i++)
- {
- packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
- }
-}
-
-#elif defined(PROCESS_4X_3Y_2Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y and 2 elements @ Z at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
- uvec2 packed[2];
- vec4 ret[2];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
- uvec2 packed[3];
- vec4 ret[3];
-
- GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
- ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
- vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
- vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
- vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
- vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
- vec4 res[3];
- vec2 w[5][3];
- vec4 s[STRIDE_X + 1];
- uvec2 packed_d;
- uint z_index = (gl_GlobalInvocationID.z);
- uint s_offset = src.current_offset;
- int i, z;
-
- weights.current_offset += z_index * weights_stride_w;
-
- for(z = 0; z < 2; z++)
- {
- z_index += uint(z);
- src.current_offset = s_offset;
-
- for(i = 0; i < 3; i++)
- {
- res[i] = vec4(0);
- }
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load weights once
- for(int row = 0; row < 5; row++)
- {
- w[row] = load_weight(weights, row);
- }
-
- // 1st line
- s = LOAD_SRC(src, 0);
- res[0] += CONVOLVE1x5(s, w[0]);
-
- // 2nd line
- s = LOAD_SRC(src, 1);
- res[0] += CONVOLVE1x5(s, w[1]);
- res[1] += CONVOLVE1x5(s, w[0]);
-
- // 3rd line
- s = LOAD_SRC(src, 2);
- res[0] += CONVOLVE1x5(s, w[2]);
- res[1] += CONVOLVE1x5(s, w[1]);
- res[2] += CONVOLVE1x5(s, w[0]);
-
- // 4th line
- s = LOAD_SRC(src, 3);
- res[0] += CONVOLVE1x5(s, w[3]);
- res[1] += CONVOLVE1x5(s, w[2]);
- res[2] += CONVOLVE1x5(s, w[1]);
-
- // 5th line
- s = LOAD_SRC(src, 4);
- res[0] += CONVOLVE1x5(s, w[4]);
- res[1] += CONVOLVE1x5(s, w[3]);
- res[2] += CONVOLVE1x5(s, w[2]);
-
- // 6th line
- s = LOAD_SRC(src, 5);
- res[1] += CONVOLVE1x5(s, w[4]);
- res[2] += CONVOLVE1x5(s, w[3]);
-
- // 7th line
- s = LOAD_SRC(src, 6);
- res[2] += CONVOLVE1x5(s, w[4]);
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- for(i = 0; i < 3; i++)
- {
- res[i] += vec4(b);
- }
-#endif /* BIAS */
-
- for(i = 0; i < 3; i++)
- {
- packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
- }
-
- dst.current_offset += dst_stride_z;
- }
-}
-
-#elif defined(PROCESS_8X_1Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements @ X at once
- *
- * @note This OpenGL ES shader works with stride_x = 1
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#error stride == 2 for PROCESS_8X_1Y not implemented
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[3] load_src_stride1(Image src, int row)
-{
- uvec4 packed[2];
- vec4 ret[3];
-
- GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
- ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
- ret[1] = vec4(unpackHalf2x16(packed[0].z), unpackHalf2x16(packed[0].w));
- ret[2] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
- return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
- uvec3 packed_w;
- vec2 ret[3];
-
- GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
- ret[0] = vec2(unpackHalf2x16(packed_w[0]));
- ret[1] = vec2(unpackHalf2x16(packed_w[1]));
- ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
- return ret;
-}
-
-vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret[2];
-
- ret[0] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- src0 = tmp[1];
- src1 = vec4(tmp[1].yzw, tmp[2].x);
- src2 = vec4(tmp[1].zw, tmp[2].xy);
- src3 = vec4(tmp[1].w, tmp[2].xyz);
- src4 = tmp[2];
- ret[1] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-void main()
-{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
- Vector bias = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
- vec4 res[2];
- vec2 w[3];
- vec4 s[STRIDE_X + 2];
- uvec4 packed_d;
- uint z_index = gl_GlobalInvocationID.z;
-
- res[0] = vec4(0);
- res[1] = vec4(0);
- weights.current_offset += z_index * weights_stride_w;
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- for(int row = 0; row < 5; row++)
- {
- w = load_weight(weights, row);
- s = LOAD_SRC(src, row);
- res[0] += CONVOLVE1x5(s, w)[0];
- res[1] += CONVOLVE1x5(s, w)[1];
- }
-
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
- }
-
-#ifdef BIAS
- uint packed_b;
- float b;
-
- GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
- b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
- res[0] += vec4(b);
- res[1] += vec4(b);
-#endif /* BIAS */
-
- packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
- packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
-}
-
-#else /* defined(PROCESS_4X_1Y_1Z) */
-
-#endif /* defined(PROCESS_4X_1Y_1Z) */
-
-#else /* DATA_TYPE_FP16 */
+#endif /* PROCESS_nX_nY_nZ */
+#else /* DATA_TYPE_FP32 */
#error Data type not supported
-#endif /* DATA_TYPE_FP16 */
+#endif /* DATA_TYPE_FP32 */
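
Taken together, the rewritten shader is specialised entirely through compile-time defines: DATA_TYPE_FP32 or DATA_TYPE_FP16, STRIDE_X, the optional BIAS guard, and a PROCESS_nX_nY_nZ variant selector for FP16. The build-option plumbing on the host side is not part of this diff, so the helper below is only a hedged sketch of how such a define set could be assembled; the names mirror the guards visible in the shader.

    #include <set>
    #include <string>

    // Illustrative only: the defines direct_convolution5x5.cs checks. Note the
    // code guards on "#ifdef BIAS" while the doc comments speak of HAS_BIAS.
    std::set<std::string> conv5x5_build_options(bool fp16, unsigned int stride_x, bool has_bias)
    {
        std::set<std::string> opts;
        opts.insert(fp16 ? "#define DATA_TYPE_FP16" : "#define DATA_TYPE_FP32");
        opts.insert("#define STRIDE_X " + std::to_string(stride_x)); // only 1 and 2 are supported
        if(has_bias)
        {
            opts.insert("#define BIAS");
        }
        if(fp16)
        {
            opts.insert("#define PROCESS_4X_1Y_1Z"); // the FP16 variant kept in this diff
        }
        return opts;
    }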
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
index 54e08b1..8dc7f0a 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,11 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(mask);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif /*DATA_TYPE_FP16*/
uint hash(uint x)
{
@@ -65,48 +62,34 @@
return float_construct(hash(floatBitsToUint(v + seed)));
}
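
For reference, the forward path that follows boils down to a per-element keep-or-zero rule. A hedged CPU sketch is given below; RATIO, SCALE and SEED are compile-time defines set by the caller, and SCALE is conventionally 1 / (1 - RATIO), although this diff does not pin that value down.

    // Keep an element with probability (1 - ratio), zero it otherwise, and
    // rescale survivors; the mask is stored so the backward pass can reuse it.
    float dropout_forward_ref(float input, float random01, float ratio, float scale, float &mask_out)
    {
        mask_out = (random01 > ratio) ? 1.f : 0.f;
        return mask_out * input * scale;
    }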
-#ifdef DATA_TYPE_FP32
-
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(mask, 2, float, );
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-
 /** Dropout is used to reduce over-fitting in neural networks.
*
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP32"
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] mask_ptr Pointer to the mask tensor. Supported data types: same as @p src_ptr
- * @param[in] mask_stride_x Stride of the mask tensor in X dimension (in bytes)
- * @param[in] mask_step_x mask_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mask_stride_y Stride of the mask tensor in Y dimension (in bytes)
- * @param[in] mask_step_y mask_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] mask_stride_z Stride of the mask tensor in Z dimension (in bytes)
- * @param[in] mask_step_z mask_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] mask_ptr Pointer to the mask tensor. Supported data types: same as @p src_ptr
+ * @param[in] mask_attrs The attributes of the mask tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes mask_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maskBuffer, float, mask_ptr, mask_shift, 2, );
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator mask_iter = CONVERT_TO_TENSOR3D_ITERATOR(mask_attrs, mask_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
float random = 0.f;
float inputv = 0.f;
@@ -116,64 +99,29 @@
#ifdef FORWARD
random = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
maskv = (random > RATIO) ? 1.f : 0.f;
- GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+ STORE_CURRENT_ITEM(mask_ptr, mask_iter, maskv);
#else /* FORWARD */
- GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
+ maskv = LOAD_CURRENT_ITEM(mask_ptr, mask_iter);
#endif /* FORWARD */
- GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
+ inputv = LOAD_CURRENT_ITEM(src_ptr, src_iter);
outputv = maskv * inputv * float(SCALE);
- GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, outputv);
}
#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maskBuffer, uint, mask_ptr, mask_shift, 2, );
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-precision mediump float;
-
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(mask, 2, uint, );
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-
-/** Dropout is used to improve over-fit on neural networks.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] mask_ptr Pointer to the mask tensor. Supported data types: same as @p src_ptr
- * @param[in] mask_stride_x Stride of the mask tensor in X dimension (in bytes)
- * @param[in] mask_step_x mask_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mask_stride_y Stride of the mask tensor in Y dimension (in bytes)
- * @param[in] mask_step_y mask_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] mask_stride_z Stride of the mask tensor in Z dimension (in bytes)
- * @param[in] mask_step_z mask_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] mask_offset_first_element_in_bytes The offset of the first element in the mask tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
void main(void)
{
- Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator mask_iter = CONVERT_TO_TENSOR3D_ITERATOR(mask_attrs, mask_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
float random1 = 0.f;
float random2 = 0.f;
- uint inputv = uint(0);
- uint outputv = uint(0);
- uint maskv = uint(0);
vec2 input_vec = vec2(0, 0);
vec2 output_vec = vec2(0, 0);
vec2 mask_vec = vec2(0, 0);
@@ -183,20 +131,16 @@
random2 = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED);
mask_vec.x = (random1 > RATIO) ? 1.f : 0.f;
mask_vec.y = (random2 > RATIO) ? 1.f : 0.f;
- maskv = packHalf2x16(mask_vec);
- GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0);
+
+ STORE_PACK2_CURRENT_ITEM_HALF(mask_ptr, mask_iter, mask_vec);
#else /* FORWARD */
- GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0);
- mask_vec = unpackHalf2x16(maskv);
+ mask_vec = LOAD_UNPACK2_CURRENT_ITEM_HALF(mask_ptr, mask_iter);
#endif /* FORWARD */
- GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0);
-
- input_vec = unpackHalf2x16(inputv);
+ input_vec = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
output_vec = mask_vec * input_vec * float(SCALE);
- outputv = packHalf2x16(output_vec);
- GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, output_vec);
}
#else /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
index c64572b..4f87b92 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
@@ -132,7 +132,7 @@
ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
// Update pointer to point to the starting point of the valid region
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, uint(start_pos_y) * buf_attrs.stride_y + uint(start_pos_x) * buf_attrs.stride_x);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
int gid0 = int(gl_GlobalInvocationID.x);
@@ -158,12 +158,29 @@
{
if(pos % 2 == 0)
{
- STORE_PACK2_HALF(buf_ptr, offset, left_val.xx);
+ if(BORDER_SIZE_LEFT % 2 == 0)
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, left_val.xx);
+ }
+ else
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, left_val.yy);
+ }
+ i++;
}
}
}
// Handle right border
- vec2 right_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
+ vec2 right_val_origin = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
+ vec2 right_val;
+ if((((BORDER_SIZE_LEFT + int(width)) % 2)) == 1)
+ {
+ right_val = vec2(right_val_origin.x, right_val_origin.x);
+ }
+ else
+ {
+ right_val = vec2(right_val_origin.y, right_val_origin.y);
+ }
for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
{
uint offset = IMAGE_OFFSET(buf_iter, int(width) + i, gidH);
@@ -173,7 +190,8 @@
{
if(pos % 2 == 0)
{
- STORE_PACK2_HALF(buf_ptr, offset, right_val.yy);
+ STORE_PACK2_HALF(buf_ptr, offset, right_val);
+ i++;
}
else
{
@@ -184,7 +202,8 @@
{
if(pos % 2 == 0)
{
- STORE_PACK2_HALF(buf_ptr, offset, right_val.yy);
+ STORE_PACK2_HALF(buf_ptr, offset, right_val);
+ i++;
}
}
}
@@ -208,7 +227,14 @@
{
if(gidW == (int(width) - 1))
{
- STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
+ if(((BORDER_SIZE_LEFT + int(width)) % 2 == 1))
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
+ }
+ else
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
+ }
}
else
{
@@ -229,6 +255,10 @@
{
STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
}
+ else
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
+ }
}
else
{
@@ -268,6 +298,10 @@
{
STORE_PACK2_HALF(buf_ptr, offset, bottom_val.yy);
}
+ else
+ {
+ STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
+ }
}
else
{
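
The FP16 changes in fill_border.cs all hinge on one fact: with two half values packed per 32-bit word, the lane an element occupies depends on its absolute position in the padded row, i.e. on (BORDER_SIZE_LEFT + x) % 2 rather than on x alone, which is why the left/right edge replication now branches on those parities. A small sketch of the mapping (an illustration of the indexing argument, not library code):

    struct HalfLane
    {
        unsigned int word; // index of the 32-bit word within the padded row
        bool         high; // false -> low 16 bits (.x), true -> high 16 bits (.y)
    };

    // x counts from the first valid (non-border) element of the row.
    HalfLane fp16_lane_in_padded_row(int border_size_left, int x)
    {
        const int absolute = border_size_left + x;
        return { static_cast<unsigned int>(absolute / 2), (absolute % 2) != 0 };
    }

With this mapping the leftmost valid element sits in lane BORDER_SIZE_LEFT % 2 and the rightmost in lane (BORDER_SIZE_LEFT + width - 1) % 2, which is what the new .xx / .yy selections encode.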
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index 8cf95af..4c8730e 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,94 +22,64 @@
* SOFTWARE.
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
#if defined(DATA_TYPE_FP32)
-#define LOAD8(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1))
-
-#define LOAD16(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1)); \
- r.z = LOAD4(name, offset + uint(2)); \
- r.w = LOAD4(name, offset + uint(3))
-
-#define STORE16(name, offset, r) \
- STORE4(name, offset, r.x); \
- STORE4(name, offset + uint(1), r.y); \
- STORE4(name, offset + uint(2), r.z); \
- STORE4(name, offset + uint(3), r.w)
-
#ifdef GEMM_TRANSPOSE1xW
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
/** This OpenGL ES kernel computes the "vector" 1x4 transposition of input matrix
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
/* Compute address for Matrix B - source */
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
/* Compute address for Matrix B transposed - destination. X and Y are swapped */
- uint dst_addr_in_bytes = (gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst.stride_y + dst.offset_first_element_in_bytes) >> 2;
- vec4 b0;
- LOAD16(b0, src, offset(src, 0, 0));
- STORE16(dst, dst_addr_in_bytes, b0);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);
+
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, b0);
}
#endif /* GEMM_TRANSPOSE1xW */
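Note: each invocation of the F32 transpose kernel above loads one vec4 (four consecutive floats of a row of matrix B) and stores it at an address computed with X and Y swapped (gl_GlobalInvocationID.y * 16 bytes into the row selected by gl_GlobalInvocationID.x). A minimal host-side sketch of the resulting layout, not part of this patch and using the hypothetical name transpose1x4_reference, assuming a row-major M x N source with N a multiple of 4:

#include <cstddef>
#include <vector>

// Row y, 4-column block x of the source ends up in row x of the destination at column offset 4*y.
static std::vector<float> transpose1x4_reference(const std::vector<float> &src, size_t M, size_t N)
{
    const size_t blocks = N / 4; // number of 4-column blocks per source row (gl_GlobalInvocationID.x)
    std::vector<float> dst(blocks * 4 * M);
    for(size_t y = 0; y < M; ++y) // source row (gl_GlobalInvocationID.y)
    {
        for(size_t x = 0; x < blocks; ++x)
        {
            for(size_t k = 0; k < 4; ++k)
            {
                dst[x * (4 * M) + 4 * y + k] = src[y * N + 4 * x + k];
            }
        }
    }
    return dst;
}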
#ifdef GEMM_INTERLEAVE4x4
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
/** This OpenGLES kernel reshapes the input matrix interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
/* Compute source and destination addresses */
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
int i;
int j;
@@ -118,102 +88,80 @@
{
for(j = 0; j < 4; ++j)
{
- float res = LOAD4(src, offset(src, i, j));
- uint ofset0 = CURRENT_OFFSET(dst) + uint(i * 4 + j);
- STORE4(dst, ofset0, res);
+ float res = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, j));
+ STORE(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, (i * 4 + j)), res);
}
}
}
#endif /* GEMM_INTERLEAVE4x4 */
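Note: the interleave kernel above walks a 4x4 tile (i along X, j along Y) and writes the tile out column by column, so four consecutive source rows are interleaved into one destination row. A minimal reference sketch of the same reshape, not part of this patch, assuming M and N are multiples of 4 and using the hypothetical name interleave4x4_reference:

#include <cstddef>
#include <vector>

// dst has M/4 rows of 4*N floats; dst row r holds a(4r,c) a(4r+1,c) a(4r+2,c) a(4r+3,c) for c = 0..N-1.
static std::vector<float> interleave4x4_reference(const std::vector<float> &src, size_t M, size_t N)
{
    std::vector<float> dst(M * N);
    for(size_t r = 0; r < M / 4; ++r) // one destination row per group of 4 source rows
    {
        for(size_t c = 0; c < N; ++c) // source column
        {
            for(size_t j = 0; j < 4; ++j) // row within the group of 4
            {
                dst[r * (4 * N) + 4 * c + j] = src[(4 * r + j) * N + c];
            }
        }
    }
    return dst;
}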
#ifdef GEMM_ACCUMULATE_BIASES
-BUFFER_DECLARATION(accum, 1, float, restrict);
-BUFFER_DECLARATION(biases, 2, float, readonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(accum);
- VECTOR_PARAM_DECLARATION(biases);
-};
-
/** This kernel accumulates each row with the biases vector
*
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
- * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in] accum_attrs The attributes of the accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in] biases_attrs The attributes of the biases tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes accum_attrs;
+ VectorAttributes biases_attrs;
+};
+TENSOR_DECLARATION(1, accumBuffer, float, accum_ptr, accum_shift, 2, restrict);
+TENSOR_DECLARATION(2, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
+
void main(void)
{
- Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
- Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+ ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
for(int i = 0; i < 16; ++i)
{
- float accum_value = LOAD4(accum, CURRENT_OFFSET(accum) + uint(i));
- float biases_value = LOAD4(biases, CURRENT_OFFSET(biases) + uint(i));
+ float accum_value = LOAD(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i));
+ float biases_value = LOAD(biases_ptr, TENSOR_OFFSET_ADVANCE(biases_iter, i));
accum_value = biases_value + accum_value;
 // Store result in the accumulate buffer
- STORE4(accum, CURRENT_OFFSET(accum) + uint(i), accum_value);
+ STORE(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i), accum_value);
}
}
#endif /* GEMM_ACCUMULATE_BIASES */
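Note: the bias-accumulation kernel broadcasts the 1-D bias vector across every row of the GEMM output, 16 elements per invocation, and writes the sum back in place. A minimal reference sketch, not part of this patch, with the hypothetical name accumulate_biases_reference:

#include <cstddef>
#include <vector>

// accum is a rows x cols row-major matrix updated in place; biases has cols elements.
static void accumulate_biases_reference(std::vector<float> &accum, const std::vector<float> &biases,
                                        size_t rows, size_t cols)
{
    for(size_t r = 0; r < rows; ++r)
    {
        for(size_t c = 0; c < cols; ++c)
        {
            accum[r * cols + c] += biases[c]; // each shader invocation covers 16 columns of one row
        }
    }
}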
 #ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* unvalidated */
-BUFFER_DECLARATION(src0, 1, float, readonly);
-BUFFER_DECLARATION(src1, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src0);
- IMAGE_PARAM_DECLARATION(src1);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main()
{
- Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
- Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
/* Compute address for matrix A and B */
- src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2);
- src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2);
-
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
/* Compute end row address for matrix B */
- int end_row_mtx_b = int(src1.current_offset) + int(COLS_B);
+ int end_row_mtx_b = int(TENSOR_OFFSET_ADVANCE(src1_iter, COLS_B));
/* Reset accumulators */
vec4 c00 = vec4(0.0f);
@@ -221,13 +169,11 @@
vec4 c20 = vec4(0.0f);
vec4 c30 = vec4(0.0f);
- for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8))
+ for(; int(CURRENT_ITEM_OFFSET(src1_iter)) <= (end_row_mtx_b - 8); TENSOR_ITERATOR_ADVANCE(src0_iter, 8), TENSOR_ITERATOR_ADVANCE(src1_iter, 8))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0;
- vec4 b0;
- LOAD16(a0, src0, src0.current_offset);
- LOAD16(b0, src1, src1.current_offset);
+ vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
c00 += vec4(a0.x) * b0;
c10 += vec4(a0.y) * b0;
@@ -235,8 +181,8 @@
c30 += vec4(a0.w) * b0;
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- LOAD16(a0, src0, src0.current_offset + uint(4));
- LOAD16(b0, src1, src1.current_offset + uint(4));
+ a0 = VLOAD4(vec4, src0_ptr, TENSOR_OFFSET_ADVANCE(src0_iter, 4));
+ b0 = VLOAD4(vec4, src1_ptr, TENSOR_OFFSET_ADVANCE(src1_iter, 4));
c00 += vec4(a0.x) * b0;
c10 += vec4(a0.y) * b0;
@@ -244,13 +190,11 @@
c30 += vec4(a0.w) * b0;
}
- for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4))
+ for(; int(CURRENT_ITEM_OFFSET(src1_iter)) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE(src0_iter, 4), TENSOR_ITERATOR_ADVANCE(src1_iter, 4))
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0;
- vec4 b0;
- LOAD16(a0, src0, src0.current_offset);
- LOAD16(b0, src1, src1.current_offset);
+ vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
c00 += vec4(a0.x) * b0;
c10 += vec4(a0.y) * b0;
@@ -265,62 +209,49 @@
c30 = c30 * vec4(ALPHA);
/* Store 4x4 block */
- STORE16(dst, offset(dst, 0, 0), c00);
- STORE16(dst, offset(dst, 0, 1), c10);
- STORE16(dst, offset(dst, 0, 2), c20);
- STORE16(dst, offset(dst, 0, 3), c30);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
}
#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
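Note: with A interleaved and B transposed as above, the kernel streams one vec4 of each operand per step and applies four rank-1 updates to a 4x4 accumulator block, then scales by ALPHA before the 4x4 store. A minimal scalar sketch of that inner loop, not part of this patch, with the hypothetical name gemm_4x4_block and assuming the two reshapes have already been applied:

#include <array>
#include <cstddef>

// a_row: one row of the interleaved A (4 source rows packed column-wise), 4*k floats.
// b_row: one row of the transposed B (4 output columns packed row-wise), 4*k floats.
// Returns one 4x4 block of alpha * A * B.
static std::array<std::array<float, 4>, 4> gemm_4x4_block(const float *a_row, const float *b_row,
                                                          size_t k, float alpha)
{
    std::array<std::array<float, 4>, 4> c{}; // zero-initialised accumulators (c00..c30 in the shader)
    for(size_t i = 0; i < k; ++i)
    {
        const float *a = a_row + 4 * i; // a(0,i) a(1,i) a(2,i) a(3,i)
        const float *b = b_row + 4 * i; // b(i,0) b(i,1) b(i,2) b(i,3)
        for(size_t r = 0; r < 4; ++r)
        {
            for(size_t col = 0; col < 4; ++col)
            {
                c[r][col] += a[r] * b[col]; // rank-1 update, matches cXX += vec4(a0.r) * b0
            }
        }
    }
    for(auto &row : c)
    {
        for(auto &v : row)
        {
            v *= alpha;
        }
    }
    return c;
}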
#ifdef GEMM_MM_FLOATING_POINT
-BUFFER_DECLARATION(src0, 1, float, readonly);
-BUFFER_DECLARATION(src1, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src0);
- IMAGE_PARAM_DECLARATION(src1);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main()
{
- Image src0 = CONVERT_TO_IMAGE_STRUCT(src0);
- Image src1 = CONVERT_TO_IMAGE_STRUCT(src1);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
/* Compute the address for the vector A and matrix B */
- src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2);
- src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
/* Compute end row address for matrix A */
- int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2);
+ int end_row_vec_a = int(TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, COLS_A * 4));
/* Reset accumulators */
vec4 acc0 = vec4(0.0f);
@@ -334,27 +265,21 @@
vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2))
+ for(; int(CURRENT_ITEM_OFFSET(src0_iter)) <= (end_row_vec_a - 2); TENSOR_ITERATOR_ADVANCE(src0_iter, 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
{
- vec2 a0;
- LOAD8(a0, src0, src0.current_offset);
+ vec2 a0 = VLOAD2_CURRENT_ITEM(vec2, src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1;
- LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2)));
+ vec2 a1 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2;
- LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+ vec2 a2 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3;
- LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+ vec2 a3 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 b0;
- vec4 b1;
- LOAD16(b0, src1, src1.current_offset);
- LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2)));
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ vec4 b1 = VLOAD4(vec4, src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));
acc0 += b0 * vec4(a0.x);
acc0 += b1 * vec4(a0.y);
@@ -372,26 +297,22 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
- for(; int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2))
+ for(; int(CURRENT_ITEM_OFFSET(src0_iter)) < end_row_vec_a; TENSOR_ITERATOR_ADVANCE(src0_iter, 1), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
{
// Load values from matrix A
- float a0;
- a0 = LOAD4(src0, src0.current_offset);
+ float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- float a1;
- a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2)));
+ float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- float a2;
- a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2)));
+ float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- float a3;
- a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2)));
+ float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 b0;
- LOAD16(b0, src1, src1.current_offset);
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
acc0 += b0 * vec4(a0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -407,122 +328,173 @@
/* Multiply by the weight of vector-matrix product */
acc0 = acc0 * vec4(ALPHA);
- STORE16(dst, offset(dst, 0, 0), acc0);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
acc1 = acc1 * vec4(ALPHA);
- STORE16(dst, offset(dst, 0, 1), acc1);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
acc2 = acc2 * vec4(ALPHA);
- STORE16(dst, offset(dst, 0, 2), acc2);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
acc3 = acc3 * vec4(ALPHA);
- STORE16(dst, offset(dst, 0, 3), acc3);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#endif /* GEMM_MM_FLOATING_POINT */
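Note: the un-reshaped F32 path gives each work-item a tile of NUM_ELEMS_PROCESSED_PER_THREAD_Y rows by 4 columns of the output and walks the shared dimension, multiplying one scalar of A per handled row by a 4-wide slice of the corresponding row of B. A minimal scalar sketch of one tile, not part of this patch, with the hypothetical name gemm_tile_reference (lda/ldb/ldc are row strides in elements):

#include <cstddef>

template <size_t ROWS_PER_THREAD> // plays the role of NUM_ELEMS_PROCESSED_PER_THREAD_Y
static void gemm_tile_reference(const float *A, const float *B, float *C,
                                size_t lda, size_t ldb, size_t ldc,
                                size_t row0, size_t col0, size_t K, float alpha)
{
    float acc[ROWS_PER_THREAD][4] = {}; // acc0..acc3 in the shader
    for(size_t k = 0; k < K; ++k)
    {
        const float *b = B + k * ldb + col0; // 4 consecutive columns of row k of B
        for(size_t r = 0; r < ROWS_PER_THREAD; ++r)
        {
            const float a = A[(row0 + r) * lda + k]; // one scalar of A per handled row
            for(size_t c = 0; c < 4; ++c)
            {
                acc[r][c] += a * b[c];
            }
        }
    }
    for(size_t r = 0; r < ROWS_PER_THREAD; ++r)
    {
        for(size_t c = 0; c < 4; ++c)
        {
            C[(row0 + r) * ldc + col0 + c] = alpha * acc[r][c];
        }
    }
}

For instance, gemm_tile_reference<4>(...) mirrors the NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 configuration handled by acc0..acc3.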
#ifdef GEMM_MATRIXADDITION
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, restrict);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
 * @attention The beta's value needs to be passed at compile time using BETA
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
+
void main(void)
{
/* Compute source and destination addresses */
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
/* Load values from A x B */
- vec4 alpha_ab;
- vec4 c;
- vec4 out1;
-
- LOAD16(alpha_ab, dst, dst.current_offset);
- LOAD16(c, src, src.current_offset);
+ vec4 alpha_ab = VLOAD4_CURRENT_ITEM(vec4, dst_ptr, dst_iter);
+ vec4 c = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
/* Computes alpha * axb + beta * c */
- out1 = alpha_ab + vec4(BETA * c);
+ vec4 out1 = alpha_ab + vec4(float(BETA) * c);
/* Store final result in axb matrix */
- STORE16(dst, dst.current_offset, out1);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, out1);
}
#endif /* GEMM_MATRIXADDITION */
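Note: the matrix-addition kernel finishes C = alpha * A * B + beta * C by reading the destination (which already holds alpha * A * B), reading the original C from the source, and storing the weighted sum back in place, four floats per invocation. A minimal sketch, not part of this patch, with the hypothetical name gemm_matrix_addition_reference:

#include <cstddef>

// dst_alpha_ab already contains alpha * A * B; src_c contains the original C.
static void gemm_matrix_addition_reference(float *dst_alpha_ab, const float *src_c,
                                           size_t elements, float beta)
{
    for(size_t i = 0; i < elements; ++i)
    {
        dst_alpha_ab[i] += beta * src_c[i]; // out1 = alpha_ab + BETA * c in the shader
    }
}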
+
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-#ifdef GEMM_MM_FLOATING_POINT
-#if defined(MM_PROCESS_4X)
-BUFFER_DECLARATION(src0, 1, uint, readonly);
-BUFFER_DECLARATION(src1, 2, uvec2, readonly);
-BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
-layout(std140) uniform shader_params
+#ifdef GEMM_TRANSPOSE1xW
+/** This OpenGL ES kernel computes the "vector" 1x8 transposition of input matrix
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
{
- IMAGE_PARAM_DECLARATION(src0);
- IMAGE_PARAM_DECLARATION(src1);
- IMAGE_PARAM_DECLARATION(dst);
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
};
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+void main(void)
+{
+ /* Compute address for Matrix B - source */
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
+
+ /* Compute address for Matrix B transposed - destination. X and Y are swapped */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);
+
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
+}
+#endif /* GEMM_TRANSPOSE1xW */
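Note: on the F16 path the buffers are declared as uvec2/uvec4 because two half-precision values are packed into each 32-bit lane; a uvec4 therefore carries 8 halves, which is why the 1x8 transposition above moves a single uvec4 per invocation and the other F16 kernels go through the LOAD_UNPACK*/STORE_PACK* helpers built on unpackHalf2x16/packHalf2x16 (first value in the low 16 bits). A minimal sketch of that packing convention, not part of this patch, keeping the raw half bits as uint16_t and using the hypothetical names pack8_half/unpack8_half:

#include <array>
#include <cstddef>
#include <cstdint>

// Pack 8 raw half-precision bit patterns into 4 uints (two halves per 32-bit lane).
static std::array<uint32_t, 4> pack8_half(const std::array<uint16_t, 8> &h)
{
    std::array<uint32_t, 4> packed{};
    for(size_t i = 0; i < 4; ++i)
    {
        packed[i] = static_cast<uint32_t>(h[2 * i]) | (static_cast<uint32_t>(h[2 * i + 1]) << 16);
    }
    return packed;
}

// Inverse of pack8_half: split each 32-bit lane back into its two 16-bit halves.
static std::array<uint16_t, 8> unpack8_half(const std::array<uint32_t, 4> &packed)
{
    std::array<uint16_t, 8> h{};
    for(size_t i = 0; i < 4; ++i)
    {
        h[2 * i]     = static_cast<uint16_t>(packed[i] & 0xFFFFu);
        h[2 * i + 1] = static_cast<uint16_t>(packed[i] >> 16);
    }
    return h;
}

The GLSL helpers additionally convert between the half bit patterns and float; that conversion is omitted from this sketch.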
+
+#ifdef GEMM_INTERLEAVE4x4
+/** This OpenGLES kernel reshapes the input matrix interleaving the values
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+
+void main(void)
+{
+ /* Compute source and destination addresses */
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ vec4 s0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ vec4 s1[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ vec4 s2[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ vec4 s3[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
+
+ vec4 s[2];
+ s[0] = vec4(s0[0].x, s1[0].x, s2[0].x, s3[0].x);
+ s[1] = vec4(s0[0].y, s1[0].y, s2[0].y, s3[0].y);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
+
+ s[0] = vec4(s0[0].z, s1[0].z, s2[0].z, s3[0].z);
+ s[1] = vec4(s0[0].w, s1[0].w, s2[0].w, s3[0].w);
+ STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 1u), s);
+
+ s[0] = vec4(s0[1].x, s1[1].x, s2[1].x, s3[1].x);
+ s[1] = vec4(s0[1].y, s1[1].y, s2[1].y, s3[1].y);
+ STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 2u), s);
+
+ s[0] = vec4(s0[1].z, s1[1].z, s2[1].z, s3[1].z);
+ s[1] = vec4(s0[1].w, s1[1].w, s2[1].w, s3[1].w);
+ STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 3u), s);
+}
+#endif /* GEMM_INTERLEAVE4x4 */
+
+#ifdef GEMM_MM_FLOATING_POINT
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
* @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
*
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+
+#if defined(MM_PROCESS_4X)
+TENSOR_DECLARATION(1, src0Buffer, uint, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+
void main()
{
- Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
- Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
/* Compute the address for the vector A and matrix B */
- src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
/* Compute end row address for matrix A */
- uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+ uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
/* Reset accumulators */
vec4 acc0 = vec4(0.0f);
@@ -536,42 +508,22 @@
vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- for(; int(src0.current_offset) < int(end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(2));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
{
- uint packed_a;
- vec2 a0;
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
- a0 = vec2(unpackHalf2x16(packed_a));
+ vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1;
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
- a1 = vec2(unpackHalf2x16(packed_a));
+ vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2;
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
- a2 = vec2(unpackHalf2x16(packed_a));
+ vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3;
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
- a3 = vec2(unpackHalf2x16(packed_a));
+ vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uvec2 packed_b0;
- uvec2 packed_b1;
- vec4 b0;
- vec4 b1;
-
- GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
- GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1);
-
- b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
- b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y));
+ vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
+ vec4 b1 = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));
acc0 += b0 * vec4(a0.x);
acc0 += b1 * vec4(a0.y);
@@ -589,38 +541,20 @@
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
- for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
{
- uint packed_a0;
- vec2 a0;
-
- GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0);
- a0 = vec2(unpackHalf2x16(packed_a0));
+ vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1;
-
- GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 1);
- a1 = vec2(unpackHalf2x16(packed_a0));
+ vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2;
-
- GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 2);
- a2 = vec2(unpackHalf2x16(packed_a0));
+ vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3;
-
- GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 3);
- a3 = vec2(unpackHalf2x16(packed_a0));
+ vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uvec2 packed_b0;
- vec4 b0;
-
- GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0);
-
- b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y));
+ vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
acc0 += b0 * (a0.x);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -637,71 +571,35 @@
/* Multiply by the weight of vector-matrix product */
acc0 = acc0 * vec4(ALPHA);
- uvec2 packed_d;
- packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
#elif defined(MM_PROCESS_4X_OPTIMIZED) /* PROCESS_4X */
-BUFFER_DECLARATION(src0, 1, uvec4, readonly);
-BUFFER_DECLARATION(src1, 2, uvec2, readonly);
-BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
+TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src0);
- IMAGE_PARAM_DECLARATION(src1);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
- *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
void main()
{
- Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
- Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
/* Compute the address for the vector A and matrix B */
- src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
/* Compute end row address for matrix A */
- uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+ uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
/* Reset accumulators */
vec4 acc0 = vec4(0.0f);
@@ -716,48 +614,29 @@
vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
- uvec4 packed_a;
- vec4 a0[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
- a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 a1[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
- a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 a2[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
- a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 a3[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
- a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uvec2 packed_b;
- vec4 b;
+ vec4 b;
for(int i = 0; i < 8; i++)
{
int j = i >> 2;
int k = i % 4;
- GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
-
- b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+ b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
acc0 += b * vec4(a0[j][k]);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -772,39 +651,21 @@
}
}
- for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
- uvec4 packed_a;
- vec4 a0[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
- a0[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a0[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 a1[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 1);
- a1[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a1[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 a2[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 2);
- a2[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a2[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 a3[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 3);
- a3[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a3[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
+ vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- uvec2 packed_b;
- vec4 b;
+ vec4 b;
int leftover = COLS_A % 8;
@@ -813,9 +674,7 @@
int j = i >> 2;
int k = i % 4;
- GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
-
- b = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
+ b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
acc0 += b * vec4(a0[j][k]);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
@@ -833,71 +692,35 @@
/* Multiply by the weight of vector-matrix product */
acc0 = acc0 * vec4(ALPHA);
- uvec2 packed_d;
- packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- packed_d = uvec2(packHalf2x16(acc1.xy), packHalf2x16(acc1.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 1);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- packed_d = uvec2(packHalf2x16(acc2.xy), packHalf2x16(acc2.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 2);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- packed_d = uvec2(packHalf2x16(acc3.xy), packHalf2x16(acc3.zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 3);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
}
-#elif defined(MM_PROCESS_8X) /* PROCESS_4X */
-BUFFER_DECLARATION(src0, 1, uvec4, readonly);
-BUFFER_DECLARATION(src1, 2, uvec4, readonly);
-BUFFER_DECLARATION(dst, 3, uvec4, writeonly);
+#elif defined(MM_PROCESS_8X) /* PROCESS_8X */
+TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
+TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(src0);
- IMAGE_PARAM_DECLARATION(src1);
- IMAGE_PARAM_DECLARATION(dst);
-};
-
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
- *
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
void main()
{
- Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0);
- Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
/* Compute the address for the vector A and matrix B */
- src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
/* Compute end row address for matrix A */
- uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1);
+ uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
/* Reset accumulators */
vec4 acc[2];
@@ -905,44 +728,29 @@
acc[0] = vec4(0.0f);
acc[1] = vec4(0.0f);
- for(; int(src0.current_offset) < int(end_row_vec_a - uint(16)); src0.current_offset += uint(8) * src0_stride_x, src1.current_offset += uint(8) * src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
- uvec4 packed_a;
- vec4 a[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
- a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
-
- uvec4 packed_b;
- vec4 b[2];
+ vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
+ vec4 b[2];
for(int i = 0; i < 8; i++)
{
int j = i >> 2;
int k = i % 4;
- GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
-
- b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
- b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+ b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
acc[0] += b[0] * vec4(a[j][k]);
acc[1] += b[1] * vec4(a[j][k]);
}
}
- for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 8), src1.current_offset += uint(8) * src1_stride_y)
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * uint(2)), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
- uvec4 packed_a;
- vec4 a[2];
-
- GC_LOAD1_2D_OFFSET(packed_a, src0, 0, 0);
- a[0] = vec4(unpackHalf2x16(packed_a.x), unpackHalf2x16(packed_a.y));
- a[1] = vec4(unpackHalf2x16(packed_a.z), unpackHalf2x16(packed_a.w));
-
- uvec4 packed_b;
- vec4 b[2];
+ vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
+ vec4 b[2];
int leftover = COLS_A % 8;
@@ -951,10 +759,7 @@
int j = i >> 2;
int k = i % 4;
- GC_LOAD1_2D_OFFSET(packed_b, src1, 0, i);
-
- b[0] = vec4(unpackHalf2x16(packed_b.x), unpackHalf2x16(packed_b.y));
- b[1] = vec4(unpackHalf2x16(packed_b.z), unpackHalf2x16(packed_b.w));
+ b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
acc[0] += b[0] * vec4(a[j][k]);
acc[1] += b[1] * vec4(a[j][k]);
@@ -965,102 +770,180 @@
acc[0] = acc[0] * vec4(ALPHA);
acc[1] = acc[1] * vec4(ALPHA);
- uvec4 packed_d;
- packed_d = uvec4(packHalf2x16(acc[0].xy), packHalf2x16(acc[0].zw), packHalf2x16(acc[1].xy), packHalf2x16(acc[1].zw));
- GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc);
}
-#endif /* PROCESS_4X */
+#endif /* PROCESS_8X */
#endif /* GEMM_MM_FLOATING_POINT */
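Note: the MM_PROCESS_*_OPTIMIZED and MM_PROCESS_8X variants consume the shared dimension eight half-values at a time (one uvec4 load of A) and then run a second loop over the COLS_A % 8 leftover, so COLS_A does not have to be a multiple of 8. A minimal scalar sketch of that two-loop structure, not part of this patch, with the hypothetical name dot_product_blocked (the real kernels keep vec4 accumulators per output row rather than a single float):

#include <cstddef>

static float dot_product_blocked(const float *a, const float *b, size_t K)
{
    float acc = 0.0f;
    size_t k = 0;
    for(; k + 8 <= K; k += 8) // main loop: full blocks of 8 along the shared dimension
    {
        for(size_t i = 0; i < 8; ++i)
        {
            acc += a[k + i] * b[k + i];
        }
    }
    for(; k < K; ++k) // leftover loop: the K % 8 tail
    {
        acc += a[k] * b[k];
    }
    return acc;
}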
#ifdef GEMM_ACCUMULATE_BIASES
#if defined(ACCUM_PROCESS_4X)
-BUFFER_DECLARATION(accum, 1, uvec2, restrict);
-BUFFER_DECLARATION(biases, 2, uvec2, readonly);
-
-layout(std140) uniform shader_params
-{
- IMAGE_PARAM_DECLARATION(accum);
- VECTOR_PARAM_DECLARATION(biases);
-};
-
/** This kernel accumulates each row with the biases vector
*
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
- * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
+ * @param[in] accum_attrs The attributes of the accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in] biases_attrs The attributes of the biases tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes accum_attrs;
+ VectorAttributes biases_attrs;
+};
+
+TENSOR_DECLARATION(1, accumBuffer, uvec2, accum_ptr, accum_shift, 3, restrict);
+TENSOR_DECLARATION(2, biasesBuffer, uvec2, biases_ptr, biases_shift, 3, readonly);
+
void main(void)
{
- Image accum = GC_CONVERT_TO_IMAGE_STRUCT(accum);
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+ ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
- vec4 u[2];
- uvec2 packed_s[2];
- GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
- GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
- u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- u[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+ vec4 u[2];
+ u[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
+ u[1] = LOAD_UNPACK4_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
vec4 tmp;
- tmp = u[0] + u[1];
- packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw));
- GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+ tmp = u[0] + u[1];
+ STORE_PACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter, tmp);
}
-#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
-BUFFER_DECLARATION(accum, 1, uvec4, restrict);
-BUFFER_DECLARATION(biases, 2, uvec4, readonly);
-
-layout(std140) uniform shader_params
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_8X */
+SHADER_PARAMS_DECLARATION
{
- IMAGE_PARAM_DECLARATION(accum);
- VECTOR_PARAM_DECLARATION(biases);
+ ImageAttributes accum_attrs;
+ VectorAttributes biases_attrs;
};
-/** This kernel accumulates each row with the biases vector
- *
- * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16
- * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
- * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
+TENSOR_DECLARATION(1, accumBuffer, uvec4, accum_ptr, accum_shift, 4, restrict);
+TENSOR_DECLARATION(2, biasesBuffer, uvec4, biases_ptr, biases_shift, 4, readonly);
+
void main(void)
{
- Image accum = GC_CONVERT_TO_IMAGE_STRUCT(accum);
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases);
+ ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
- vec4 u[2];
- vec4 v[2];
- uvec4 packed_s[2];
- GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0);
- GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0);
-
- u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- u[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
-
- v[0] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
- v[1] = vec4(unpackHalf2x16(packed_s[1].z), unpackHalf2x16(packed_s[1].w));
+ vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
+ vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
vec4 r[2];
- r[0] = u[0] + v[0];
- r[1] = u[1] + v[1];
- packed_s[0] = uvec4(packHalf2x16(r[0].xy), packHalf2x16(r[0].zw), packHalf2x16(r[1].xy), packHalf2x16(r[1].zw));
- GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
+ r[0] = u[0] + v[0];
+ r[1] = u[1] + v[1];
+ STORE_PACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter, r);
}
-#endif /* ACCUM_PROCESS_4X */
+#endif /* ACCUM_PROCESS_8X */
#endif /* GEMM_ACCUMULATE_BIASES */
-#else /* DATA_TYPE_FP32 */
+
+#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED
+/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha value need to be passed at compile time using COLS_B and ALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, uvec2, src0_ptr, src0_shift, 3, readonly);
+TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+
+void main()
+{
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ /* Compute address for matrix A and B */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
+ /* Compute end row address for matrix B */
+ int end_row_mtx_b = (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) + int(COLS_B);
+
+ /* Reset accumulators */
+ vec4 c00[2];
+ vec4 c10[2];
+ vec4 c20[2];
+ vec4 c30[2];
+ c00[0] = vec4(0.0f);
+ c00[1] = vec4(0.0f);
+ c10[0] = vec4(0.0f);
+ c10[1] = vec4(0.0f);
+ c20[0] = vec4(0.0f);
+ c20[1] = vec4(0.0f);
+ c30[0] = vec4(0.0f);
+ c30[1] = vec4(0.0f);
+
+ // FIXME: loop unrolling really needed for GLES?
+ for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
+ vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
+
+ c00[0] += vec4(a0.x) * b0[0];
+ c00[1] += vec4(a0.x) * b0[1];
+ c10[0] += vec4(a0.y) * b0[0];
+ c10[1] += vec4(a0.y) * b0[1];
+ c20[0] += vec4(a0.z) * b0[0];
+ c20[1] += vec4(a0.z) * b0[1];
+ c30[0] += vec4(a0.w) * b0[0];
+ c30[1] += vec4(a0.w) * b0[1];
+
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ a0 = LOAD_UNPACK4_HALF(src0_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, 8));
+ b0 = LOAD_UNPACK8_HALF(src1_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src1_iter, 16));
+
+ c00[0] += vec4(a0.x) * b0[0];
+ c00[1] += vec4(a0.x) * b0[1];
+ c10[0] += vec4(a0.y) * b0[0];
+ c10[1] += vec4(a0.y) * b0[1];
+ c20[0] += vec4(a0.z) * b0[0];
+ c20[1] += vec4(a0.z) * b0[1];
+ c30[0] += vec4(a0.w) * b0[0];
+ c30[1] += vec4(a0.w) * b0[1];
+ }
+
+ for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 16))
+ {
+ /* Load values from matrix A (interleaved) and matrix B (transposed) */
+ vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
+ vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
+
+ c00[0] += vec4(a0.x) * b0[0];
+ c00[1] += vec4(a0.x) * b0[1];
+ c10[0] += vec4(a0.y) * b0[0];
+ c10[1] += vec4(a0.y) * b0[1];
+ c20[0] += vec4(a0.z) * b0[0];
+ c20[1] += vec4(a0.z) * b0[1];
+ c30[0] += vec4(a0.w) * b0[0];
+ c30[1] += vec4(a0.w) * b0[1];
+ }
+
+ /* Multiply by the weight of matrix product */
+ c00[0] = c00[0] * vec4(ALPHA);
+ c00[1] = c00[1] * vec4(ALPHA);
+ c10[0] = c10[0] * vec4(ALPHA);
+ c10[1] = c10[1] * vec4(ALPHA);
+ c20[0] = c20[0] * vec4(ALPHA);
+ c20[1] = c20[1] * vec4(ALPHA);
+ c30[0] = c30[0] * vec4(ALPHA);
+ c30[1] = c30[1] * vec4(ALPHA);
+
+ /* Store 4x8 block */
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
+}
+#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
+#else /* DATA_TYPE_FP16 */
#error Data type not supported
#endif /* DATA_TYPE_FP32 */
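
For reference, a minimal standalone sketch (ESSL 3.10; hypothetical buffer names ABuf/BBuf/CBuf and a hypothetical compile-time K) of the 4x8 tile update that one invocation of the FP16 interleaved/transposed kernel above performs. It is not part of the patch: the library's iterator macros are replaced by plain SSBO indexing so the half packing and the broadcast-multiply pattern are easier to follow, and the final multiply by ALPHA is omitted.

#version 310 es
// Illustrative sketch only (not library code).
layout(local_size_x = 1) in;

#ifndef K
#define K 16 // depth of the matrix product handled by this tile (hypothetical default)
#endif

layout(std430, binding = 0) readonly buffer ABuf { uvec2 a_packed[]; };  // 4 fp16 per element (interleaved A)
layout(std430, binding = 1) readonly buffer BBuf { uvec4 b_packed[]; };  // 8 fp16 per element (transposed B)
layout(std430, binding = 2) writeonly buffer CBuf { uvec4 c_packed[]; }; // one 8-wide fp16 row per element

void main()
{
    // Accumulators for the 4x8 tile: row r is split into c[2*r] (columns 0-3) and c[2*r+1] (columns 4-7).
    vec4 c[8];
    for(int i = 0; i < 8; ++i)
    {
        c[i] = vec4(0.0);
    }

    for(int k = 0; k < K; ++k)
    {
        // Four consecutive fp16 values of interleaved A belong to four different output rows.
        uvec2 pa = a_packed[k];
        vec4  a  = vec4(unpackHalf2x16(pa.x), unpackHalf2x16(pa.y));

        // Eight consecutive fp16 values of transposed B form one 8-wide slice of the tile.
        uvec4 pb = b_packed[k];
        vec4  b0 = vec4(unpackHalf2x16(pb.x), unpackHalf2x16(pb.y));
        vec4  b1 = vec4(unpackHalf2x16(pb.z), unpackHalf2x16(pb.w));

        // Broadcast each A value across one output row and accumulate.
        for(int r = 0; r < 4; ++r)
        {
            c[2 * r]     += vec4(a[r]) * b0;
            c[2 * r + 1] += vec4(a[r]) * b1;
        }
    }

    // Pack each 8-wide row back to fp16 and store one row of the tile per element.
    for(int r = 0; r < 4; ++r)
    {
        c_packed[r] = uvec4(packHalf2x16(c[2 * r].xy), packHalf2x16(c[2 * r].zw),
                            packHalf2x16(c[2 * r + 1].xy), packHalf2x16(c[2 * r + 1].zw));
    }
}
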
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
deleted file mode 100644
index 62c58d5..0000000
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers.h
+++ /dev/null
@@ -1,582 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_HELPER_H
-#define ARM_COMPUTE_HELPER_H
-
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-
-#define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
-
-#define CONVERT(x, type) type(x)
-
-#define PACK(value, stype, dtype) \
- pack_##stype##_##dtype(value)
-
-#define UNPACK(value, stype, dtype) \
- unpack_##stype##_##dtype(value)
-
-#define BUFFER_DECLARATION(name, location, type, access) \
- layout(std430, binding = location) access buffer name##Buffer \
- { \
- type name##_ptr[]; \
- }
-
-#define VECTOR_PARAM_DECLARATION(name) \
- uint name##_stride_x; \
- uint name##_step_x; \
- uint name##_offset_first_element_in_bytes; \
- uint name##_buffer_data_type_size
-
-#define IMAGE_PARAM_DECLARATION(name) \
- uint name##_stride_x; \
- uint name##_step_x; \
- uint name##_stride_y; \
- uint name##_step_y; \
- uint name##_offset_first_element_in_bytes; \
- uint name##_buffer_data_type_size; \
- uint name##_padding1; \
- uint name##_padding2
-
-#define TENSOR3D_PARAM_DECLARATION(name) \
- uint name##_stride_x; \
- uint name##_step_x; \
- uint name##_stride_y; \
- uint name##_step_y; \
- uint name##_stride_z; \
- uint name##_step_z; \
- uint name##_offset_first_element_in_bytes; \
- uint name##_buffer_data_type_size
-
-/** Structure to hold Vector information */
-struct Vector
-{
- uint current_offset; /**< Current offset of vector */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- uint stride_x; /**< Stride of the image in X dimension (in bytes) */
-};
-
-/** Structure to hold Image information */
-struct Image
-{
- uint current_offset; /**< Current offset of image */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- uint stride_x; /**< Stride of the image in X dimension (in bytes) */
- uint stride_y; /**< Stride of the image in Y dimension (in bytes) */
-};
-
-/** Structure to hold 3D tensor information */
-struct Tensor3D
-{
- uint current_offset; /**< Current offset of tensor */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */
- uint stride_x; /**< Stride of the image in X dimension (in bytes) */
- uint stride_y; /**< Stride of the image in Y dimension (in bytes) */
- uint stride_z; /**< Stride of the image in Z dimension (in bytes) */
-};
-
-/////////////////////////////////////////////////////////////
-
-#define CONVERT_TO_VECTOR_STRUCT(name) \
- update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
-
-#define CONVERT_TO_VECTOR_STRUCT_FP16(name) \
- update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
-
-#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
- update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
-
-#define CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(name) \
- update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
-
-#define CONVERT_TO_IMAGE_STRUCT(name) \
- update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
-
-#define CONVERT_TO_IMAGE_STRUCT_FP16(name) \
- update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
-
-#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
-
-#define CONVERT_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
- update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \
- update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_FP16(name) \
- update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT(name) \
- update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT_FP16(name) \
- update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
-
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
-
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(name) \
- update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
-
-#define LOAD4(name, offset) \
- name##_ptr[offset]
-
-#define STORE4(name, offset, value) \
- name##_ptr[offset] = value
-
-// Load 1 element, which size is determined by ssbo type.
-#define LOAD1(r, name, offset) \
- r = name##_ptr[offset]
-
-#define STORE1(name, offset, value) \
- name##_ptr[offset] = value
-
-#define LOAD2(r, name, offset) \
- LOAD1(r[0], name, offset); \
- LOAD1(r[1], name, (offset) + uint(1))
-
-#define STORE2(name, offset, value) \
- name##_ptr[offset] = value[0]; \
- name##_ptr[(offset) + uint(1)] = value[1]
-
-#define LOAD3(r, name, offset) \
- LOAD1(r[0], name, offset); \
- LOAD1(r[1], name, (offset) + uint(1)); \
- LOAD1(r[2], name, (offset) + uint(2))
-
-#define CURRENT_OFFSET(name) \
- name.current_offset
-
-/** Wrap vector information into an Vector structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x Stride of the vector in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- *
- * @return An vector object
- */
-Vector update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
-{
- Vector vector;
- vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
- vector.stride_x = stride_x;
- vector.current_offset = (vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x) >> 2;
-
- return vector;
-}
-
-/** Wrap vector information into an Vector structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x Stride of the vector in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- *
- * @return An vector object
- */
-Vector update_vector_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
-{
- Vector vector;
- vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
- vector.stride_x = stride_x;
- vector.current_offset = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
-
- return vector;
-}
-
-/** Wrap image information into an Image structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- *
- * @return An image object
- */
-Image update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y) >> 2;
-
- return img;
-}
-
-/** Wrap image information into an Image structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- *
- * @return An image object
- */
-Image update_image_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
-
- return img;
-}
-
-/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
- *
- * @return A 2D Image object
- */
-Image update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
-
- return img;
-}
-
-/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
- *
- * @return A 2D Image object
- */
-Image update_image_from_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
-
- return img;
-}
-
-/** Wrap 3D tensor information into an tensor structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-Tensor3D update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Tensor3D tensor;
- tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
- tensor.stride_x = stride_x;
- tensor.stride_y = stride_y;
- tensor.stride_z = stride_z;
- tensor.current_offset = (tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2;
-
- return tensor;
-}
-
-/** Wrap 3D tensor information into an tensor structure, and make the offset to be this workitem's position.
- *
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] stride_z Stride of the image in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
- *
- * @return A 3D tensor object
- */
-Tensor3D update_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Tensor3D tensor;
- tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
- tensor.stride_x = stride_x;
- tensor.stride_y = stride_y;
- tensor.stride_z = stride_z;
- tensor.current_offset = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
-
- return tensor;
-}
-
-/** Get the pointer position of a Vector
- *
- * @param[in] vec Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- */
-uint vector_offset(Vector vec, int x)
-{
- return CONVERT(CONVERT(vec.current_offset << 2, int) + x * CONVERT(vec.stride_x, int), uint) >> 2;
-}
-
-/** Get the pointer position of a Vector
- *
- * @param[in] vec Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- */
-uint vector_offset_fp16(Vector vec, int x)
-{
- return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
-}
-
-/** Get the pointer position of a Image
- *
- * @param[in] img Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- */
-uint offset(Image img, int x, int y)
-{
- return CONVERT(CONVERT(img.current_offset << 2, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint) >> 2;
-}
-
-/** Get the pointer position of a Image
- *
- * @param[in] img Pointer to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- */
-uint offset_fp16(Image img, int x, int y)
-{
- return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
-}
-
-/** Get the pointer position of a Tensor3D
- *
- * @param[in] tensor Pointer to the starting postion of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] z Relative Z position
- */
-uint tensor3D_offset(Tensor3D tensor, int x, int y, int z)
-{
- return CONVERT(CONVERT(tensor.current_offset << 2, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint) >> 2;
-}
-
-/** Get the pointer position of a Tensor3D
- *
- * @param[in] tensor Pointer to the starting postion of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] z Relative Z position
- */
-uint tensor3D_offset_fp16(Tensor3D tensor, int x, int y, int z)
-{
- return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
-}
-
-/////////////////////////////////////////////////////////////
-// new one
-
-#define GC_CONVERT_TO_VECTOR_STRUCT(name) \
- gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
-
-#define GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
- gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0))
-
-#define GC_CONVERT_TO_IMAGE_STRUCT(name) \
- gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
-
-#define GC_CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
- gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0))
-
-#define GC_CONVERT_TO_TENSOR3D_STRUCT(name) \
- gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
- name##_stride_z, name##_step_z)
-
-#define GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
- gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0))
-
-#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
- gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
-
-#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
- gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z)
-
-Vector gc_update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x)
-{
- Vector vector;
- vector.offset_first_element_in_bytes = offset_first_element_in_bytes;
- vector.stride_x = stride_x;
- vector.current_offset = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x;
-
- return vector;
-}
-
-Image gc_update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y;
-
- return img;
-}
-
-Tensor3D gc_update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Tensor3D tensor;
- tensor.offset_first_element_in_bytes = offset_first_element_in_bytes;
- tensor.stride_x = stride_x;
- tensor.stride_y = stride_y;
- tensor.stride_z = stride_z;
- tensor.current_offset = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
-
- return tensor;
-}
-
-Image gc_update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Image img;
- img.offset_first_element_in_bytes = offset_first_element_in_bytes;
- img.stride_x = stride_x;
- img.stride_y = stride_y;
- img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z;
-
- return img;
-}
-
-#define GC_CURRENT_OFFSET(name) \
- name.current_offset
-
-uint gc_vector_offset(Vector vec, int x)
-{
- return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint);
-}
-
-uint gc_image_offset(Image img, int x, int y)
-{
- return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint);
-}
-
-uint gc_tensor3D_offset(Tensor3D tensor, int x, int y, int z)
-{
- return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint);
-}
-
-// load/store number of element depends on buffer type
-#define GC_LOAD1(r, name, offset) \
- r = name##_ptr[offset]
-
-#define GC_LOAD2(r, name, offset) \
- GC_LOAD1(r[0], name, offset); \
- GC_LOAD1(r[1], name, (offset) + uint(1))
-
-#define GC_LOAD3(r, name, offset) \
- GC_LOAD1(r[0], name, offset); \
- GC_LOAD1(r[1], name, (offset) + uint(1)); \
- GC_LOAD1(r[2], name, (offset) + uint(2))
-
-#define GC_STORE1(value, name, offset) \
- name##_ptr[offset] = value
-
-#define GC_STORE2(value, name, offset) \
- GC_STORE1(value[0], name, offset); \
- GC_STORE1(value[1], name, (offset) + uint(1))
-
-#define GC_STORE3(value, name, offset) \
- GC_STORE1(value[0], name, offset); \
- GC_STORE1(value[1], name, (offset) + uint(1)); \
- GC_STORE1(value[2], name, (offset) + uint(2))
-
-// has to manually expand them since not supported by compiler
-#define GC_LOAD1_1D_OFFSET(r, name, x) \
- GC_LOAD1(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD1_2D_OFFSET(r, name, x, y) \
- GC_LOAD1(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD1_3D_OFFSET(r, name, x, y, z) \
- GC_LOAD1(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
-
-#define GC_STORE1_1D_OFFSET(value, name, x) \
- GC_STORE1(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
-
-#define GC_STORE1_2D_OFFSET(value, name, x, y) \
- GC_STORE1(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
-
-#define GC_STORE1_3D_OFFSET(value, name, x, y, z) \
- GC_STORE1(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD2_1D_OFFSET(r, name, x) \
- GC_LOAD2(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD2_2D_OFFSET(r, name, x, y) \
- GC_LOAD2(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD2_3D_OFFSET(r, name, x, y, z) \
- GC_LOAD2(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
-
-#define GC_STORE2_1D_OFFSET(value, name, x) \
- GC_STORE2(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
-
-#define GC_STORE2_2D_OFFSET(value, name, x, y) \
- GC_STORE2(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
-
-#define GC_STORE2_3D_OFFSET(value, name, x, y, z) \
- GC_STORE2(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD3_1D_OFFSET(r, name, x) \
- GC_LOAD3(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD3_2D_OFFSET(r, name, x, y) \
- GC_LOAD3(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size)
-
-#define GC_LOAD3_3D_OFFSET(r, name, x, y, z) \
- GC_LOAD3(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size)
-
-/////////////////////////////////////////////////////////////
-
-#endif // _HELPER_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
old mode 100755
new mode 100644
index 404b46a..014ff40
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -222,6 +222,9 @@
#define TENSOR3D_OFFSET(tensor_iter, x, y, z) \
uint(tensor3D_offset_in_bytes(tensor_iter, int(x), int(y), int(z)) >> tensor_iter.element_shift)
+#define TENSOR_OFFSET_ADVANCE(tensor_iter, n) \
+ uint((tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift) + int(n))
+
#define TENSOR_OFFSET_ADVANCE_IN_BYTES(tensor_iter, n) \
uint((tensor_iter.current_offset_in_bytes + int(n)) >> tensor_iter.element_shift)
@@ -231,9 +234,15 @@
#define CURRENT_ITEM_OFFSET_IN_BYTES(tensor_iter) \
uint(tensor_iter.current_offset_in_bytes)
+#define TENSOR_ITERATOR_ADVANCE(tensor_iter, n) \
+ tensor_iter.current_offset_in_bytes += (int(n) << tensor_iter.element_shift)
+
#define TENSOR_ITERATOR_ADVANCE_IN_BYTES(tensor_iter, n) \
tensor_iter.current_offset_in_bytes += int(n)
+#define SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(tensor_iter, n) \
+ tensor_iter.current_offset_in_bytes = int(n)
+
/** Get the offset of a VectorIterator
*
* @param[in] vector_iter The VectorIterator object pointed to the starting position of the buffer
@@ -317,6 +326,23 @@
#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+#define VLOAD5(return_type, tensor_ptr, offset) \
+ return_type(LOAD(tensor_ptr, offset), \
+ LOAD(tensor_ptr, (offset) + uint(1)), \
+ LOAD(tensor_ptr, (offset) + uint(2)), \
+ LOAD(tensor_ptr, (offset) + uint(3)), \
+ LOAD(tensor_ptr, (offset) + uint(4)))
+
+#define VSTORE5(tensor_ptr, offset, data) \
+ STORE(tensor_ptr, offset, data[0]); \
+ STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+ STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+ STORE(tensor_ptr, (offset) + uint(3), data[3]); \
+ STORE(tensor_ptr, (offset) + uint(4), data[4])
+
+#define VLOAD5_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD5(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE5_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE5(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
/** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object
*
* @param[in] data The vec4 object to be packed
@@ -339,6 +365,19 @@
return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
}
+/** Unpacking the uvec3 object to 6 half-precision (16-bits) floating point values and converting to a vec2[3] object
+ *
+ * @param[in] packed_data The uvec3 object to be unpacked
+ *
+ * @return The unpacked vec2[3] object
+ */
+mediump vec2[3] unpack6_half(highp uvec3 packed_data)
+{
+ return vec2[3](unpackHalf2x16(packed_data[0]),
+ unpackHalf2x16(packed_data[1]),
+ unpackHalf2x16(packed_data[2]));
+}
+
/** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object
*
* @param[in] data The vec4[2] object to be packed
@@ -363,6 +402,19 @@
vec4(unpackHalf2x16(packed_data.z), unpackHalf2x16(packed_data.w)));
}
+/** Unpacking the uvec2[3] object to 12 half-precision (16-bits) floating point values and converting to a vec4[3] object
+ *
+ * @param[in] packed_data The uvec2[3] object to be unpacked
+ *
+ * @return The unpacked vec4[3] object
+ */
+mediump vec4[3] unpack12_half(highp uvec2[3] packed_data)
+{
+ return vec4[3](vec4(unpackHalf2x16(packed_data[0].x), unpackHalf2x16(packed_data[0].y)),
+ vec4(unpackHalf2x16(packed_data[1].x), unpackHalf2x16(packed_data[1].y)),
+ vec4(unpackHalf2x16(packed_data[2].x), unpackHalf2x16(packed_data[2].y)));
+}
+
// For half-precision (16-bits) floating point packed into a "uint" element
#define LOAD_UNPACK2_HALF(tensor_ptr, offset) unpackHalf2x16(uint(LOAD(tensor_ptr, offset)))
#define STORE_PACK2_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, packHalf2x16(data))
@@ -374,6 +426,9 @@
#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+#define VLOAD3_UNPACK6_HALF(tensor_ptr, offset) unpack6_half(VLOAD3(uvec3, tensor_ptr, offset))
+#define VLOAD3_UNPACK6_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK6_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+
#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
@@ -390,6 +445,9 @@
#define VLOAD2_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
#define VSTORE2_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+#define VLOAD3_UNPACK12_HALF(tensor_ptr, offset) unpack12_half(VLOAD3(uvec2[3], tensor_ptr, offset))
+#define VLOAD3_UNPACK12_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK12_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+
// For half-precision (16-bits) floating point packed into a "uvec4" element
#define LOAD_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(uvec4(LOAD(tensor_ptr, offset)))
#define STORE_PACK8_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack8_half(data))
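
For reference, a small self-contained sketch (not library code; hypothetical buffer names, with the helper re-declared locally) showing what the new unpack12_half() helper and the VLOAD3_UNPACK12_HALF macro amount to: three consecutive packed uvec2 words, i.e. 12 fp16 values, are loaded with the VLOAD3 pattern and unpacked into a vec4[3] of single-precision values.

#version 310 es
// Illustrative only: mirrors the unpack12_half() definition added above.
layout(local_size_x = 1) in;

layout(std430, binding = 0) readonly buffer SrcBuf { uvec2 src_ptr[]; };
layout(std430, binding = 1) writeonly buffer DstBuf { vec4 dst_ptr[]; };

mediump vec4[3] unpack12_half(highp uvec2 packed_data[3])
{
    return vec4[3](vec4(unpackHalf2x16(packed_data[0].x), unpackHalf2x16(packed_data[0].y)),
                   vec4(unpackHalf2x16(packed_data[1].x), unpackHalf2x16(packed_data[1].y)),
                   vec4(unpackHalf2x16(packed_data[2].x), unpackHalf2x16(packed_data[2].y)));
}

void main()
{
    uint base = gl_GlobalInvocationID.x * 3u;

    // VLOAD3 pattern: three consecutive elements starting at the current offset.
    uvec2 packed_s[3] = uvec2[3](src_ptr[base], src_ptr[base + 1u], src_ptr[base + 2u]);

    // 12 packed fp16 values become three vec4 of floats.
    vec4 unpacked[3] = unpack12_half(packed_s);

    dst_ptr[base]      = unpacked[0];
    dst_ptr[base + 1u] = unpacked[1];
    dst_ptr[base + 2u] = unpacked[2];
}
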
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
old mode 100755
new mode 100644
index 166953f..f3cb52e
--- a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -24,57 +24,38 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src1);
- TENSOR3D_PARAM_DECLARATION(src2);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-BUFFER_DECLARATION(src1, 1, float, readonly);
-BUFFER_DECLARATION(src2, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-
-#ifdef CROSS_MAP
-/** Apply cross map normalization.
+/** Apply cross map normalization or in map normalization, selected at compile time with CROSS_MAP or IN_MAP_1D
*
* @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x"
* @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
* @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
* @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
*
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_attrs The attributes of the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_attrs The attributes of the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src1_attrs;
+ Tensor3DAttributes src2_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(2, src2Buffer, float, src2_ptr, src2_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
+#ifdef CROSS_MAP
void main(void)
{
- Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
- Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+ Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
float acc = 0.0;
@@ -86,54 +67,22 @@
for(int i = left_slice; i <= right_slice; i++)
{
- acc += src2_ptr[tensor3D_offset(src2, 0, 0, i - current_slice)];
+ acc += LOAD(src2_ptr, TENSOR3D_OFFSET(src2_iter, 0, 0, i - current_slice));
}
float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
- float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+ float normalized_pixel = (LOAD_CURRENT_ITEM(src1_ptr, src1_iter)) / normalized;
- dst_ptr[dst.current_offset] = normalized_pixel;
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, normalized_pixel);
}
#elif defined(IN_MAP_1D)
-/** Apply in map normalization.
- *
- * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x"
- * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
- * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
- * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
- *
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
void main(void)
{
- Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
- Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+ Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
float acc = 0.0;
@@ -145,13 +94,13 @@
for(int i = left_pos; i <= right_pos; i++)
{
- acc += src2_ptr[tensor3D_offset(src2, i - current_pos, 0, 0)];
+ acc += LOAD(src2_ptr, TENSOR3D_OFFSET(src2_iter, i - current_pos, 0, 0));
}
float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
- float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized;
+ float normalized_pixel = (LOAD_CURRENT_ITEM(src1_ptr, src1_iter)) / normalized;
- dst_ptr[dst.current_offset] = normalized_pixel;
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, normalized_pixel);
}
#endif /*CROSS_MAP*/
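
For reference, a scalar restatement (not library code) of the update both branches above compute, shown for the in-map 1D case: the compile-time constants KAPPA, COEFF (alpha / norm_size), BETA and RADIUS become ordinary defines with hypothetical values, buffer names are hypothetical, and borders are handled by clamping the window to [0, WIDTH). The second input is assumed to already hold the values being summed over the window.

#version 310 es
// Illustrative only: normalized = (KAPPA + COEFF * acc) ^ BETA; output = input / normalized.
layout(local_size_x = 1) in;

#define WIDTH  64              // hypothetical row length
#define RADIUS 2               // elements on each side of the current position
#define KAPPA  1.0
#define COEFF  (1.0e-4 / 5.0)  // alpha / norm_size
#define BETA   0.75

layout(std430, binding = 0) readonly buffer Src1Buf { float src1_ptr[]; }; // activations
layout(std430, binding = 1) readonly buffer Src2Buf { float src2_ptr[]; }; // values summed over the window
layout(std430, binding = 2) writeonly buffer DstBuf { float dst_ptr[]; };

void main()
{
    int x = int(gl_GlobalInvocationID.x);

    // Accumulate the (2 * RADIUS + 1) window around x, as the in-map branch above does.
    float acc   = 0.0;
    int   left  = max(0, x - RADIUS);
    int   right = min(WIDTH - 1, x + RADIUS);
    for(int i = left; i <= right; ++i)
    {
        acc += src2_ptr[i];
    }

    float normalized = pow(KAPPA + COEFF * acc, BETA);
    dst_ptr[x] = src1_ptr[x] / normalized;
}
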
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
new file mode 100644
index 0000000..18a9af7
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+precision mediump float;
+
+/** Apply normalize_planar_yuv layer.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_attrs The attributes of the mean tensor
+ * @param[in] sd_ptr Pointer to the standard deviation values tensor. Supported data types: same as @p src_ptr
+ * @param[in] sd_attrs The attributes of the sd tensor
+ */
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ VectorAttributes mean_attrs;
+ VectorAttributes sd_attrs;
+};
+
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
+TENSOR_DECLARATION(4, sdBuffer, uvec2, sd_ptr, sd_shift, 3, readonly);
+
+void main(void)
+{
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+ VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
+ VectorIterator sd_iter = CONVERT_TO_VECTOR_ITERATOR(sd_attrs, sd_shift);
+
+ vec4 unpacked_s[3];
+ vec4 tmp;
+ vec4 result;
+
+ uint current_slice = gl_GlobalInvocationID.z;
+ unpacked_s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ unpacked_s[1] = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
+ unpacked_s[2] = LOAD_UNPACK4_HALF(sd_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(sd_iter, current_slice * sd_attrs.stride_x));
+
+ if((current_slice % uint(4)) == uint(0))
+ {
+ tmp = unpacked_s[0] - unpacked_s[1].x;
+ result = tmp / unpacked_s[2].x;
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+ else if((current_slice % uint(4)) == uint(1))
+ {
+ tmp = unpacked_s[0] - unpacked_s[1].y;
+ result = tmp / unpacked_s[2].y;
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+ else if((current_slice % uint(4)) == uint(2))
+ {
+ tmp = unpacked_s[0] - unpacked_s[1].z;
+ result = tmp / unpacked_s[2].z;
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+ else
+ {
+ tmp = unpacked_s[0] - unpacked_s[1].w;
+ result = tmp / unpacked_s[2].w;
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
+ }
+}
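
For reference, a compact sketch (not library code) of the channel selection the kernel above performs with its if/else ladder, rewritten with dynamic component indexing: one fp16 mean and one fp16 standard deviation are stored per channel, four per packed uvec2 word, so channel c uses component (c % 4) of word (c / 4). Buffer names and the flat 1D pixel addressing are hypothetical simplifications of the library's 3D iterators.

#version 310 es
// Illustrative only: equivalent to the four branches above.
layout(local_size_x = 1) in;

precision mediump float;

layout(std430, binding = 0) readonly buffer SrcBuf  { uvec2 src_ptr[]; };
layout(std430, binding = 1) writeonly buffer DstBuf { uvec2 dst_ptr[]; };
layout(std430, binding = 2) readonly buffer MeanBuf { uvec2 mean_ptr[]; };
layout(std430, binding = 3) readonly buffer SdBuf   { uvec2 sd_ptr[]; };

void main()
{
    uint idx     = gl_GlobalInvocationID.x; // which packed group of 4 pixels
    uint channel = gl_GlobalInvocationID.z; // which feature map / channel
    uint word    = channel / 4u;            // packed word holding this channel's mean and sd
    int  comp    = int(channel % 4u);       // component within that word

    vec4 x     = vec4(unpackHalf2x16(src_ptr[idx].x),   unpackHalf2x16(src_ptr[idx].y));
    vec4 mean4 = vec4(unpackHalf2x16(mean_ptr[word].x), unpackHalf2x16(mean_ptr[word].y));
    vec4 sd4   = vec4(unpackHalf2x16(sd_ptr[word].x),   unpackHalf2x16(sd_ptr[word].y));

    // Subtract this channel's mean and divide by its standard deviation.
    vec4 result = (x - mean4[comp]) / sd4[comp];

    dst_ptr[idx] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
}
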
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
index 031687a..01e0f8a 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
@@ -23,53 +23,36 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src1);
- TENSOR3D_PARAM_DECLARATION(src2);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
+#include "helpers_cs.h"
-BUFFER_DECLARATION(src1, 1, float, readonly);
-BUFFER_DECLARATION(src2, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-
-/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+/** Performs a pixelwise multiplication with float scale of float inputs.
*
- * @param[in] src1_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] src1_step_z src1_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src2_ptr Pointer to the source image. Supported data types: Same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the source image in Y dimension (in bytes)
- * @param[in] src2_step_z src2_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: Same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] scale Float scaling factor. Supported data types: F32
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_attrs The attributes of the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_attrs The attributes of the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] scale Float scaling factor. Supported data types: F32
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src1_attrs;
+ Tensor3DAttributes src2_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(2, src2Buffer, float, src2_ptr, src2_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main()
{
// Get pixels pointer
- Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1);
- Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
+ Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- dst_ptr[dst.current_offset] = (src1_ptr[src1.current_offset] * src2_ptr[src2.current_offset] * float(SCALE));
+ float result = LOAD_CURRENT_ITEM(src1_ptr, src1_iter) * LOAD_CURRENT_ITEM(src2_ptr, src2_iter) * float(SCALE);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, result);
}
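
The hunk above shows the helpers_cs.h access pattern that this release moves the GLES compute shaders to: tensor shapes travel as Tensor3DAttributes inside SHADER_PARAMS_DECLARATION, buffers are bound through TENSOR_DECLARATION, and element access goes through a Tensor3DIterator plus LOAD_CURRENT_ITEM / STORE_CURRENT_ITEM instead of hand-computed byte offsets. As a reading aid, here is a minimal sketch of the same pattern for a hypothetical elementwise addition; it reuses only the macros already visible in this diff and is not part of the library.

    // Illustrative sketch only (not part of arm_compute): an elementwise add
    // written against the same helpers_cs.h macros used in the kernel above.
    layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
    #include "helpers_cs.h"

    SHADER_PARAMS_DECLARATION
    {
        Tensor3DAttributes src1_attrs;
        Tensor3DAttributes src2_attrs;
        Tensor3DAttributes dst_attrs;
    };
    TENSOR_DECLARATION(1, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
    TENSOR_DECLARATION(2, src2Buffer, float, src2_ptr, src2_shift, 2, readonly);
    TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);

    void main()
    {
        // One iterator per tensor; the shift encodes the element size within the packed buffer.
        Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
        Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
        Tensor3DIterator dst_iter  = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);

        // Load one element from each input at the current invocation, add, store.
        float result = LOAD_CURRENT_ITEM(src1_ptr, src1_iter) + LOAD_CURRENT_ITEM(src2_ptr, src2_iter);
        STORE_CURRENT_ITEM(dst_ptr, dst_iter, result);
    }
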
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
index 401b002..aa639b2 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -23,38 +23,37 @@
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
-#if defined(DATA_TYPE_FP32)
+#include "helpers_cs.h"
-float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-
-layout(std140) uniform shader_params
+/** Performs a pooling function
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP32"
+ * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n", e.g. "#define POOLING_LAYER_2"
+ * n must be one of: 2, 3, 7, N
+ * When POOLING_LAYER_N is defined, the pool size must also be passed using POOL_SIZE, e.g. POOL_SIZE=13
+ * @note In case of average pooling the following information must be passed at compile time:
+ * POOL_AVG must be provided, otherwise max pooling will be performed.
+ * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
+ * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_attrs The attributes of the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs The attributes of the destination image
+ */
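
For concreteness, one possible compile-time configuration matching the note above (a 3x3 F32 average pooling pass) could look like the following. The numeric values for window geometry, padding and local size are illustrative only, and EXCLUDE_PADDING is the optional switch used further down in this file to drop padded elements from the averaging divisor.

    // Hypothetical build options for one configuration of this shader (values are examples only).
    #define DATA_TYPE_FP32      // or DATA_TYPE_FP16
    #define POOLING_LAYER_3     // 3x3 window; use POOLING_LAYER_N plus POOL_SIZE for other sizes
    #define POOL_AVG            // average pooling; omit for max pooling
    #define EXCLUDE_PADDING     // optional: exclude padded elements from the divisor
    #define MAX_WIDTH 114       // maximum accessible x index (width + pad)
    #define MAX_HEIGHT 114      // maximum accessible y index (height + pad)
    #define STRIDE_X 1
    #define STRIDE_Y 1
    #define PAD_X 1
    #define PAD_Y 1
    #define LOCAL_SIZE_X 8
    #define LOCAL_SIZE_Y 8
    #define LOCAL_SIZE_Z 1
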
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
};
-#define LOAD8(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1))
-
-#define LOAD16(r, name, offset) \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1)); \
- r.z = LOAD4(name, offset + uint(2)); \
- r.w = LOAD4(name, offset + uint(3))
-
-#define STORE16(name, offset, r) \
- STORE4(name, offset, r.x); \
- STORE4(name, offset + uint(1), r.y); \
- STORE4(name, offset + uint(2), r.z); \
- STORE4(name, offset + uint(3), r.w)
-
+// Common definitions
#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(res, a, b) ((res) = (a) + (b))
#define POOL_OP_float(res, a, b) (res = a + b)
@@ -105,6 +104,14 @@
#define DIV_OP(x, y) (x * (1.f / y))
#define SQRT_OP(x) sqrt((x))
+#if defined(DATA_TYPE_FP32)
+
+float calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+float calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
#if defined(POOL_AVG) || defined(POOL_L2)
@@ -114,25 +121,111 @@
#endif // POOL_AVG
#endif //POOL_SIZE
-#define POOLING3x3_STRIDE1(res, input, output) \
- vec4 data00; \
- vec2 data01; \
- vec4 data10; \
- vec2 data11; \
- vec4 data20; \
- vec2 data21; \
- LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
+float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x = int(min(start_x + pool_size, upper_bound_w));
+ int end_y = int(min(start_y + pool_size, upper_bound_h));
+
+ float data_max;
+ data_max = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+
+ for(int i = 0; (start_y + i) < end_y; ++i)
+ {
+ for(int j = 0; (start_x + j) < end_x; ++j)
+ {
+ float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
+ POOL_OP_float(data_max, data_max, data);
+ }
+ }
+
+ return data_max;
+}
+
+float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x = int(min(start_x + pool_size, upper_bound_w));
+ int end_y = int(min(start_y + pool_size, upper_bound_h));
+
+ float data_total = 0.0f;
+ for(int i = 0; (start_x + i) < end_x; i++)
+ {
+ for(int j = 0; (start_y + j) < end_y; ++j)
+ {
+ float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, i, j, 0));
+ if(isnan(data))
+ {
+ data = 0.0f;
+ }
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data = POW2_OP(data, 1);
+#endif /* defined(POOL_L2) */
+ data_total = data_total + data;
+ }
+ }
+
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+
+ return data_total / float((end_y - start_y) * (end_x - start_x));
+}
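
A short worked example of the divisor logic in calculate_avg above, assuming a 3x3 window, PAD_X = PAD_Y = 1, stride 1, and upper bounds that do not clip the window. Padded elements read back as NaN are replaced with 0 before accumulation, so EXCLUDE_PADDING only changes the denominator:

    // Invocation at the top-left corner (gl_GlobalInvocationID = (0, 0)):
    //   start_x = 0 * 1 - 1 = -1,  end_x = min(-1 + 3, MAX_WIDTH)  = 2
    //   start_y = 0 * 1 - 1 = -1,  end_y = min(-1 + 3, MAX_HEIGHT) = 2
    // Without EXCLUDE_PADDING: divisor = (2 - (-1)) * (2 - (-1)) = 9
    // With EXCLUDE_PADDING:    start_x and start_y are clamped to 0, so divisor = 2 * 2 = 4
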
+
+#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
+
+#if defined(POOLING_LAYER_2)
+#define POOL_SIZE 2
+#elif defined(POOLING_LAYER_3)
+#define POOL_SIZE 3
+#elif defined(POOLING_LAYER_7)
+#define POOL_SIZE 7
+#else // POOLING_LAYER_n
+#error Please define POOLING_LAYER_N instead.
+#endif // POOLING_LAYER_n
+
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+ //Load and calculate data
+ float res;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ res = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+
+#define POOLING3x3_STRIDE1(res, input_ptr, input_iter) \
+ vec4 data00 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec2 data01 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data10 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec2 data11 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data20 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec2 data21 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
\
vec4 values000; \
vec4 values001; \
@@ -167,34 +260,25 @@
POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-#define POOLING3x3_STRIDE2(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- float data010; \
- vec4 data100; \
- vec4 data101; \
- float data11; \
- vec4 data200; \
- vec4 data201; \
- float data21; \
- LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
- LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
- LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 1); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 1); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 1); \
+#define POOLING3x3_STRIDE2(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ float data010 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
+ vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ float data11 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
+ vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ float data21 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 1); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 1); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 1); \
\
vec4 values000; \
vec4 values001; \
@@ -223,34 +307,25 @@
POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-#define POOLING3x3_STRIDE3(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- vec4 data010; \
- vec4 data100; \
- vec4 data101; \
- vec4 data11; \
- vec4 data200; \
- vec4 data201; \
- vec4 data21; \
- LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \
- LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \
- LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \
- LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \
- LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \
- LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \
- LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \
- LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \
- LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
+#define POOLING3x3_STRIDE3(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data010 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
+ vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data11 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
+ vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ vec4 data21 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
\
POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
@@ -259,199 +334,22 @@
POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw))
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x = int(min(start_x + pool_size, upper_bound_w));
- int end_y = int(min(start_y + pool_size, upper_bound_h));
-
- float data_max;
- data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0));
-
- for(int i = 0; (start_x + i) < end_x; ++i)
- {
- for(int j = 0; (start_y + j) < end_y; ++j)
- {
- float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
- POOL_OP_float(data_max, data_max, data);
- }
- }
-
- return data_max;
-}
-
-float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x = int(min(start_x + pool_size, upper_bound_w));
- int end_y = int(min(start_y + pool_size, upper_bound_h));
-
- float data_total = 0.0f;
- for(int i = 0; (start_x + i) < end_x; i++)
- {
- for(int j = 0; (start_y + j) < end_y; ++j)
- {
- float data = LOAD4(src, tensor3D_offset(src, i, j, 0));
- if(isnan(data))
- {
- data = 0.0f;
- }
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data = POW2_OP(data, 1);
-#endif /* defined(POOL_L2) */
- data_total = data_total + data;
- }
- }
-
- return data_total / float((end_y - start_y) * (end_x - start_x));
-}
-
-#ifdef POOLING_LAYER_2
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
-}
-
-#elif defined(POOLING_LAYER_3)
-/** Performs a pooling function of pool size equal to 3.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
-}
-
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
vec4 res;
// Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
- POOLING3x3_STRIDE1(res, src, dst);
+ POOLING3x3_STRIDE1(res, src_ptr, src_iter);
#elif STRIDE_X == 2
- POOLING3x3_STRIDE2(res, src, dst);
+ POOLING3x3_STRIDE2(res, src_ptr, src_iter);
#elif STRIDE_X == 3
- POOLING3x3_STRIDE3(res, src, dst);
+ POOLING3x3_STRIDE3(res, src_ptr, src_iter);
#endif /*STRIDE_X == 1*/
// Divide by pool region in case of average pooling
@@ -460,6 +358,10 @@
int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
int end_y = min((start_y + 3), MAX_HEIGHT);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(ivec4(0), start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
#endif /*POOL_AVG*/
@@ -468,109 +370,28 @@
res = SQRT_OP(res);
#endif /* defined(POOL_L2) */
- STORE16(dst, CURRENT_OFFSET(dst), res);
-}
-
-#elif defined(POOLING_LAYER_7)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Supported data types are F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
#elif defined(POOLING_LAYER_N)
-/** Performs a pooling function of pool size equal to N
- *
- * @note Supported data types are F32;
- * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- vec4 vdata0;
- vdata0 = vec4(INITIAL_VALUE);
- vec4 vdata1;
- vdata1 = vec4(INITIAL_VALUE);
- float sdata;
- sdata = float(INITIAL_VALUE);
+ vec4 vdata0 = vec4(INITIAL_VALUE);
+ vec4 vdata1 = vec4(INITIAL_VALUE);
+ float sdata = float(INITIAL_VALUE);
for(int y = 0; y < int(POOL_SIZE); y++)
{
int x = 0;
for(; x <= (int(POOL_SIZE) - 8); x += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD16(data2, src, tensor3D_offset(src, x, y, 0));
- LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4));
+ vec4 data2 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
+ vec4 data3 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(4));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -585,7 +406,7 @@
// Leftover
for(; x < int(POOL_SIZE); ++x)
{
- float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0));
+ float data4 = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4 *= data4;
@@ -606,12 +427,16 @@
#if defined(POOL_AVG) || defined(POOL_L2)
{
// Divide by pool region in case of average pooling
- int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
- int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- int end_x = int(min(start_x + POOL_SIZE, MAX_WIDTH));
- int end_y = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
- float res1 = float((end_y - start_y) * (end_x - start_x));
- res = DIV_OP(res, res1);
+ int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
+ int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ int end_x = int(min(start_x + POOL_SIZE, MAX_WIDTH));
+ int end_y = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ float res1 = float((end_y - start_y) * (end_x - start_x));
+ res = DIV_OP(res, res1);
}
#endif /* defined(POOL_AVG) || defined(POOL_L2) */
@@ -621,91 +446,17 @@
#endif /* defined(POOL_L2) */
// Store result
- STORE4(dst, CURRENT_OFFSET(dst), res);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
}
-#endif /* POOLING_LAYER_2 */
+#endif // POOLING_LAYER_N
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
+vec2 calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
+vec2 calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-vec2 load_and_unpack(Tensor3D, uint);
-vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int);
-
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-
-layout(std140) uniform shader_params
-{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
-};
-
-#define LOAD2_fp16(r, name, offset) \
- r.xy = load_and_unpack(name, offset)
-
-#define LOAD4_fp16(r, name, offset) \
- r.xy = load_and_unpack(name, offset); \
- r.zw = load_and_unpack(name, offset + uint(1))
-
-#define STORE4_fp16(name, offset, r) \
- uint datastore1; \
- uint datastore2; \
- datastore1 = uint(packHalf2x16(r.xy)); \
- datastore2 = uint(packHalf2x16(r.zw)); \
- STORE1(name, offset << uint(1), datastore1); \
- STORE1(name, (offset << uint(1)) + uint(1), datastore2)
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(res, a, b) ((res) = (a) + (b))
-#define POOL_OP_float(res, a, b) (res = a + b)
-#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- } \
- if(isnan(a.z) || (a.z < b.z)) \
- { \
- res.z = b.z; \
- } \
- if(isnan(a.w) || (a.w < b.w)) \
- { \
- res.w = b.w; \
- }
-#define POOL_OP_float(res, a, b) \
- (res) = (a); \
- if(isnan(a) || (a < b)) \
- { \
- res = b; \
- }
-#define POOL_OP_vec2(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- }
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
@@ -716,25 +467,232 @@
#endif //POOL_AVG
#endif //POOL_SIZE
-#define POOLING3x3_STRIDE1_fp16(res, input, output) \
- vec4 data00; \
- vec2 data01; \
- vec4 data10; \
- vec2 data11; \
- vec4 data20; \
- vec2 data21; \
- LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
+vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
+ int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
+ int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
+
+ int start_x2 = start_x1 + stride_x;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
+ int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
+
+ //Initialize maximum
+ vec2 data_max = vec2(0);
+
+ //Load and Set initial maximum1
+ vec2 data_init1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ data_max.x = data_init1.x;
+
+ //Load and Set initial maximum2
+ if(end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0));
+ data_max.y = data_init2.x;
+ }
+ else
+ {
+ vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0));
+ data_max.y = data_init2.y;
+ }
+ }
+
+ for(int i = 0; (start_y1 + i) < end_y1; i++)
+ for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+ {
+ //Calculate maximum1
+ if((start_x1 + j + 1) < end_x1)
+ {
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
+ float data_mr1;
+ POOL_OP_float(data_mr1, data1.x, data1.y);
+ POOL_OP_float(data_max.x, data_max.x, data_mr1);
+ }
+ else
+ {
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
+ POOL_OP_float(data_max.x, data_max.x, data1.x);
+ }
+
+ //Calculate maximum2
+ if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
+
+ if((start_x2 + j + 1) < end_x2)
+ {
+ float data_mr2;
+ POOL_OP_float(data_mr2, data2.x, data2.y);
+ POOL_OP_float(data_max.y, data_max.y, data_mr2);
+ }
+ else
+ {
+ POOL_OP_float(data_max.y, data_max.y, data2.x);
+ }
+ }
+ else
+ {
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
+ vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
+ if((start_x2 + j + 1) < end_x2)
+ {
+ float data_mr2;
+ POOL_OP_float(data_mr2, data3.x, data2.y);
+ POOL_OP_float(data_max.y, data_max.y, data_mr2);
+ }
+ else
+ {
+ POOL_OP_float(data_max.y, data_max.y, data2.y);
+ }
+ }
+ }
+ }
+ return data_max;
+}
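
In the FP16 path each uint in the buffer carries two packed half-precision values, which is why calculate_max above loads vec2 pairs and produces two horizontally adjacent outputs per invocation. A minimal standalone illustration of the packing convention, on the assumption that the LOAD_UNPACK2_HALF / STORE_PACK2 macros wrap the same standard GLSL built-ins the removed load_and_unpack helper used:

    // packHalf2x16 stores the first component in the lower 16 bits, the second in the upper 16 bits.
    uint packed_pair = packHalf2x16(vec2(1.5, -2.0));
    vec2 unpacked    = unpackHalf2x16(packed_pair);   // == vec2(1.5, -2.0)
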
+
+vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
+ int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
+ int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
+ int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
+
+ int start_x2 = start_x1 + stride_x;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
+ int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
+
+ //Initialize sum
+ float data_total1 = float(0);
+ float data_total2 = float(0);
+ for(int i = 0; (start_y1 + i) < end_y1; i++)
+ for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
+ {
+ vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+ //Calculate sum1
+ if((start_x1 + j + 1) < end_x1)
+ {
+ data_total1 = data_total1 + data1.x + data1.y;
+ }
+ else
+ {
+ data_total1 = data_total1 + data1.x;
+ }
+
+ //Calculate sum2
+ if((start_x2 + j) < end_x2 && end_x1 <= upper_bound_w)
+ {
+ if((stride_x % 2) == 0)
+ {
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data2.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.x;
+ }
+ }
+ else
+ {
+ vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
+ vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 = POW2_OP(data2, 2);
+ data3 = POW2_OP(data3, 2);
+#endif /* defined(POOL_L2) */
+ if((start_x2 + j + 1) < end_x2)
+ {
+ data_total2 = data_total2 + data3.x + data2.y;
+ }
+ else
+ {
+ data_total2 = data_total2 + data2.y;
+ }
+ }
+ }
+ }
+#if defined(EXCLUDE_PADDING)
+ start_x1 = max(0, start_x1);
+ start_y1 = max(0, start_y1);
+ start_x2 = max(0, start_x2);
+ start_y2 = max(0, start_y2);
+#endif /* defined(EXCLUDE_PADDING) */
+
+ //Calculate average
+ vec2 data_avg;
+ data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
+ data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
+
+ return data_avg;
+}
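
A brief note on the index bookkeeping in the FP16 calculate_avg above: each invocation averages two adjacent output windows, accumulated separately as data_total1 and data_total2. In terms of the shader's own variables:

    // Output column 2*gid.x     -> window starting at start_x1 = (2*gid.x) * stride_x - pad_x
    // Output column 2*gid.x + 1 -> window starting at start_x2 = start_x1 + stride_x
    // data_total1 / data_total2 accumulate the two windows; the resulting vec2 is later
    // packed into a single uint on store (see STORE_PACK2_CURRENT_ITEM_HALF in main below).
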
+
+#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
+
+#if defined(POOLING_LAYER_2)
+#define POOL_SIZE 2
+#elif defined(POOLING_LAYER_3)
+#define POOL_SIZE 3
+#elif defined(POOLING_LAYER_7)
+#define POOL_SIZE 7
+#else // POOLING_LAYER_n
+#error Please define POOLING_LAYER_N instead.
+#endif // POOLING_LAYER_n
+
+void main(void)
+{
+ // Get pixels pointer
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
+
+ //Load and calculate data
+ vec2 data;
+#if defined(POOL_AVG) || defined(POOL_L2)
+ data = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#else /*POOL_AVG*/
+ data = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /*POOL_AVG*/
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ data = SQRT_OP(data);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
+}
+
+#elif defined(POOLING_LAYER_3_OPTIMIZED)
+
+#define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \
+ vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
\
vec4 values000; \
vec4 values001; \
@@ -769,7 +727,7 @@
POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-#define POOLING3x3_STRIDE2_fp16(res, input, output) \
+#define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \
vec4 data000; \
vec4 data001; \
float data010; \
@@ -782,17 +740,17 @@
vec2 datamiddle0; \
vec2 datamiddle1; \
vec2 datamiddle2; \
- LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
+ data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
data010 = datamiddle0.x; \
- LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
+ data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
data11 = datamiddle1.x; \
- LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
+ data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
data21 = datamiddle2.x; \
data000 = POW2_OP(data000, 4); \
data001 = POW2_OP(data001, 4); \
@@ -831,34 +789,25 @@
POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-#define POOLING3x3_STRIDE3_fp16(res, input, output) \
- vec4 data000; \
- vec4 data001; \
- vec4 data010; \
- vec4 data100; \
- vec4 data101; \
- vec4 data11; \
- vec4 data200; \
- vec4 data201; \
- vec4 data21; \
- LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \
- LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \
- LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \
- LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \
- LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \
- LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \
- LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
+#define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \
+ vec4 data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
+ vec4 data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
+ vec4 data010 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
+ vec4 data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
+ vec4 data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
+ vec4 data11 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
+ vec4 data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
+ vec4 data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
+ vec4 data21 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
+ data000 = POW2_OP(data000, 4); \
+ data001 = POW2_OP(data001, 4); \
+ data010 = POW2_OP(data010, 4); \
+ data100 = POW2_OP(data100, 4); \
+ data101 = POW2_OP(data101, 4); \
+ data11 = POW2_OP(data11, 4); \
+ data200 = POW2_OP(data200, 4); \
+ data201 = POW2_OP(data201, 4); \
+ data21 = POW2_OP(data21, 4); \
\
POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
@@ -867,334 +816,22 @@
POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw))
+ POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-vec2 load_and_unpack(Tensor3D src, uint offset)
-{
- uint packed_s;
- vec2 s;
- LOAD1(packed_s, src, offset);
-
- s = vec2(unpackHalf2x16(packed_s));
- return s;
-}
-
-vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
- int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
-
- int start_x2 = start_x1 + stride_x;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
- int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
-
- //Initialize maximum
- vec2 data_max = vec2(0);
-
- //Load and Set initial maximum1
- vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2));
- data_max.x = data_init1.x;
-
- //Load and Set initial maximum2
- if(end_x1 < upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2));
- data_max.y = data_init2.x;
- }
- else
- {
- vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2));
- data_max.y = data_init2.y;
- }
- }
-
- for(int i = 0; (start_y1 + i) < end_y1; i++)
- for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
- {
- //Calculate maximum1
- if((start_x1 + j + 1) < end_x1)
- {
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
- float data_mr1;
- POOL_OP_float(data_mr1, data1.x, data1.y);
- POOL_OP_float(data_max.x, data_max.x, data_mr1);
- }
- else
- {
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
- POOL_OP_float(data_max.x, data_max.x, data1.x);
- }
-
- //Calculate maximum2
- if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2)));
-
- if((start_x2 + j + 1) < end_x2)
- {
- float data_mr2;
- POOL_OP_float(data_mr2, data2.x, data2.y);
- POOL_OP_float(data_max.y, data_max.y, data_mr2);
- }
- else
- {
- POOL_OP_float(data_max.y, data_max.y, data2.x);
- }
- }
- else
- {
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
- vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
- if((start_x2 + j + 1) < end_x2)
- {
- float data_mr2;
- POOL_OP_float(data_mr2, data3.x, data2.y);
- POOL_OP_float(data_max.y, data_max.y, data_mr2);
- }
- else
- {
- POOL_OP_float(data_max.y, data_max.y, data2.y);
- }
- }
- }
- }
- return data_max;
-}
-
-vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
- int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
-
- int start_x2 = start_x1 + stride_x;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
- int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
-
- //Initialize sum
- float data_total1 = float(0);
- float data_total2 = float(0);
- for(int i = 0; (start_y1 + i) < end_y1; i++)
- for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
- {
- vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
- //Calculate sum1
- if((start_x1 + j + 1) < end_x1)
- {
- data_total1 = data_total1 + data1.x + data1.y;
- }
- else
- {
- data_total1 = data_total1 + data1.x;
- }
-
- //Calculate sum2
- if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 = POW2_OP(data2, 2);
-#endif /* defined(POOL_L2) */
- if((start_x2 + j + 1) < end_x2)
- {
- data_total2 = data_total2 + data2.x + data2.y;
- }
- else
- {
- data_total2 = data_total2 + data2.x;
- }
- }
- else
- {
- vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2)));
- vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 = POW2_OP(data2, 2);
- data3 = POW2_OP(data3, 2);
-#endif /* defined(POOL_L2) */
- if((start_x2 + j + 1) < end_x2)
- {
- data_total2 = data_total2 + data3.x + data2.y;
- }
- else
- {
- data_total2 = data_total2 + data2.y;
- }
- }
- }
- }
- //Calculate average
- vec2 data_avg;
- data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
- data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
-
- return data_avg;
-}
-
-#ifdef POOLING_LAYER_2
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- //Load and calculate data
- vec2 data;
- uint res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- res = uint(packHalf2x16(data));
-
- // Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
-}
-
-#elif defined(POOLING_LAYER_3)
-/** Performs a pooling function of pool size equal to 3.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- //Load and calculate data
- vec2 data;
- uint res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- res = uint(packHalf2x16(data));
-
- // Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
-}
-
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
vec4 res;
// Perform pooling 3x3 for 4 output elements
#if STRIDE_X == 1
- POOLING3x3_STRIDE1_fp16(res, src, dst);
+ POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 2
- POOLING3x3_STRIDE2_fp16(res, src, dst);
+ POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter);
#elif STRIDE_X == 3
- POOLING3x3_STRIDE3_fp16(res, src, dst);
+ POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter);
#endif /*STRIDE_X == 1*/
// Divide by pool region in case of average pooling
@@ -1203,6 +840,10 @@
int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
int end_y = min((start_y + 3), MAX_HEIGHT);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(ivec4(0), start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
#endif /*POOL_AVG*/
@@ -1211,116 +852,30 @@
res = SQRT_OP(res);
#endif /* defined(POOL_L2) */
- STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res);
-}
-
-#elif defined(POOLING_LAYER_7)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Supported data types are F16;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-void main(void)
-{
- // Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
-
- //Load and calculate data
- vec2 data;
- uint res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- res = uint(packHalf2x16(data));
-
- // Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+ VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
}
#elif defined(POOLING_LAYER_N)
-/** Performs a pooling function of pool size equal to N
- *
- * @note Supported data types are F16;
- * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
+
void main(void)
{
// Get pixels pointer
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst);
+ Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- vec4 vdata00;
- vdata00 = vec4(INITIAL_VALUE);
- vec4 vdata01;
- vdata01 = vec4(INITIAL_VALUE);
- vec4 vdata10;
- vdata10 = vec4(INITIAL_VALUE);
- vec4 vdata11;
- vdata11 = vec4(INITIAL_VALUE);
- vec2 sdata;
- sdata = vec2(INITIAL_VALUE);
+ vec4 vdata00 = vec4(INITIAL_VALUE);
+ vec4 vdata01 = vec4(INITIAL_VALUE);
+ vec4 vdata10 = vec4(INITIAL_VALUE);
+ vec4 vdata11 = vec4(INITIAL_VALUE);
+ vec2 sdata = vec2(INITIAL_VALUE);
for(int y = 0; y < int(POOL_SIZE); y++)
{
int x = 0;
for(; x <= (int(POOL_SIZE) - 8); x += 8)
{
- vec4 data2;
- vec4 data3;
- LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
- LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2));
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(2));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
@@ -1335,8 +890,7 @@
// Leftover
for(; x < int(POOL_SIZE); x = x + 2)
{
- vec2 data4middle;
- data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)));
+ vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
#if defined(POOL_L2)
// Raise to power of 2 for L2 Pooling
data4middle *= data4middle;
@@ -1354,44 +908,91 @@
}
}
- for(int y = STRIDE_X; y < int(POOL_SIZE + STRIDE_X); y++)
+ for(int y = 0; y < int(POOL_SIZE); y++)
{
- int x1 = STRIDE_X;
- for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+ if((STRIDE_X % 2) == 0)
{
- vec4 data2;
- vec4 data3;
- LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
- LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- POOL_OP(vdata01, vdata01, data2);
- POOL_OP(vdata11, vdata11, data3);
- }
-
- // Leftover
- for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
- {
- vec2 data4middle;
- data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data4middle *= data4middle;
-#endif /* defined(POOL_L2) */
- if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+ int x1 = STRIDE_X;
+ for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
{
- POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ POOL_OP(vdata01, vdata01, data2);
+ POOL_OP(vdata11, vdata11, data3);
}
- else
+
+ // Leftover
+ for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
{
- float data4;
- POOL_OP_float(data4, data4middle.x, data4middle.y);
- POOL_OP_float(sdata.y, sdata.y, data4);
+ vec2 data4middle;
+ data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+ if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+ {
+ POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+ }
+ else
+ {
+ float data4;
+ POOL_OP_float(data4, data4middle.x, data4middle.y);
+ POOL_OP_float(sdata.y, sdata.y, data4);
+ }
+ }
+ }
+ else
+ {
+ vec2 dataorigin2;
+ dataorigin2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ dataorigin2.y *= dataorigin2.y;
+#endif /* defined(POOL_L2) */
+ POOL_OP_float(sdata.y, sdata.y, dataorigin2.y);
+
+ int x1 = STRIDE_X + 1;
+ for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
+ {
+ vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+ vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ POOL_OP(vdata01, vdata01, data2);
+ POOL_OP(vdata11, vdata11, data3);
+ }
+
+ // Leftover
+ for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
+ {
+ vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data4middle *= data4middle;
+#endif /* defined(POOL_L2) */
+ if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
+ {
+ POOL_OP_float(sdata.y, sdata.y, data4middle.x);
+ }
+ else
+ {
+ float data4;
+ POOL_OP_float(data4, data4middle.x, data4middle.y);
+ POOL_OP_float(sdata.y, sdata.y, data4);
+ }
}
}
}
@@ -1414,14 +1015,20 @@
#if defined(POOL_AVG) || defined(POOL_L2)
{
// Divide by pool region in case of average pooling
- int start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
- int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
- int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
- int start_x2 = start_x1 + STRIDE_X;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
- int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+ int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * STRIDE_X - PAD_X;
+ int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
+ int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
+ int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
+ int start_x2 = start_x1 + STRIDE_X;
+ int start_y2 = start_y1;
+ int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
+ int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
+#if defined(EXCLUDE_PADDING)
+ start_x1 = max(0, start_x1);
+ start_y1 = max(0, start_y1);
+ start_x2 = max(0, start_x2);
+ start_y2 = max(0, start_y2);
+#endif /* defined(EXCLUDE_PADDING) */
vec2 res1;
res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
@@ -1434,11 +1041,12 @@
// Take square root of the result in L2 pooling
data = SQRT_OP(data);
#endif /* defined(POOL_L2) */
- uint res;
- res = uint(packHalf2x16(data));
// Store result
- STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
}
-#endif /*POOLING_LAYER_2*/
-#endif /*DATA_TYPE_FP32 */
+#endif // POOLING_LAYER_N
+
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
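
Note: the EXCLUDE_PADDING hunks above only change the divisor used by average (and L2) pooling. The window start is clamped to the valid input region before the window area is computed, so padded elements no longer dilute the average. A minimal standalone C++ sketch of that divisor arithmetic follows; the function and parameter names are illustrative, not library code, and the shader performs the same computation on vectors of output positions.

#include <algorithm>
#include <cstdio>

// Divisor for average pooling at output position (out_x, out_y).
// exclude_padding mirrors the shader's EXCLUDE_PADDING compile-time switch.
static int pool_divisor(int out_x, int out_y, int pool_size, int stride_x, int stride_y,
                        int pad_x, int pad_y, int max_width, int max_height, bool exclude_padding)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    const int end_x = std::min(start_x + pool_size, max_width);
    const int end_y = std::min(start_y + pool_size, max_height);
    if(exclude_padding)
    {
        // Clamp the window start to the image so padded elements are not counted.
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return (end_x - start_x) * (end_y - start_y);
}

int main()
{
    // 3x3 pool at the top-left output with 1 pixel of padding on a 4x4 input:
    // the divisor is 9 when padded elements are counted, 4 when they are excluded.
    std::printf("%d %d\n",
                pool_divisor(0, 0, 3, 1, 1, 1, 1, 4, 4, false),
                pool_divisor(0, 0, 3, 1, 1, 1, 1, 4, 4, true));
    return 0;
}
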
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
new file mode 100644
index 0000000..b2689a2
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
+
+#include "helpers_cs.h"
+
+// We DO have to use highp floats here to calculate the source tensor coordinates for DATA_TYPE_FP16. float is highp by default, but we state it explicitly for clarity; mediump is only used for the src/dst tensor data in the shader body.
+precision highp float;
+
+/** Performs an affine transformation on an image, interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel FP16.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP16.
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: FP16. (Must be the same as the input)
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale The scale factor along x/y dimension
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
+ float input_width;
+ float input_height;
+ vec2 scale;
+};
+
+#if defined(DATA_TYPE_FP16)
+#if defined(SCALE_NEAREST_GENERIC)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+
+vec4[2] transform_nearest(vec2 coord, vec2 scale)
+{
+ vec4 in_x_coords = vec4(coord.x, 1.f + coord.x, 2.f + coord.x, 3.f + coord.x);
+
+ vec4[2] t;
+ t[0] = (in_x_coords + (vec4(0.5f))) * scale.x;
+ t[1] = vec4((coord.y + 0.5f) * scale.y);
+
+ return t;
+}
+
+vec4[2] clamp_to_border_with_size(vec4[2] coords, float width, float height, float border_size)
+{
+ vec4[2] c;
+ c[0] = clamp(coords[0], 0.0f - border_size, width - 1.f + border_size);
+ c[1] = clamp(coords[1], 0.0f - border_size, height - 1.f + border_size);
+
+ return c;
+}
+
+void main()
+{
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ vec4[2] tc = clamp_to_border_with_size(transform_nearest(vec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y), scale), input_width, input_height, float(BORDER_SIZE));
+
+ mediump vec2 s = vec2(0.0f);
+ mediump vec4 d = vec4(0.0f);
+
+ for(int i = 0; i < 4; i++)
+ {
+ uint offset_in_bytes = image_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]));
+
+ s = LOAD_UNPACK2_HALF(src_ptr, uint(offset_in_bytes >> src_shift));
+
+ if(offset_in_bytes % uint(4) == uint(0))
+ {
+ d[i] = s.x;
+ }
+ else
+ {
+ d[i] = s.y;
+ }
+ }
+
+ STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, d);
+}
+#elif defined(SCALE_NEAREST_8X) /* SCALE_NEAREST_GENERIC */
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+
+void main()
+{
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ uvec2 tc = uvec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y >> uint(1));
+
+ mediump vec4 s = vec4(0.0f);
+ mediump vec4[2] d;
+
+ s = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, int(tc[0]), int(tc[1])));
+
+ d[0] = vec4(s.x, s.x, s.y, s.y);
+ d[1] = vec4(s.z, s.z, s.w, s.w);
+
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, d);
+}
+#endif /* SCALE_NEAREST_GENERIC */
+
+#else /* DATA_TYPE_FP16 */
+#error Data type not supported
+#endif /* DATA_TYPE_FP16 */
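
Note: the generic nearest-neighbour path above maps each output pixel to a source coordinate by scaling the pixel centre and clamping to the (possibly bordered) input extent. A small CPU-side sketch of the same mapping, with hypothetical names that are not from the library:

#include <algorithm>
#include <cstdio>

// Map an output coordinate to its nearest source coordinate, clamping to the image.
// border_size corresponds to the shader's BORDER_SIZE define.
static int nearest_source_coord(int out_coord, float scale, int input_extent, int border_size)
{
    float in_coord = (static_cast<float>(out_coord) + 0.5f) * scale;
    in_coord = std::min(std::max(in_coord, 0.f - border_size),
                        static_cast<float>(input_extent) - 1.f + border_size);
    return static_cast<int>(in_coord); // truncation, as in the shader's int(tc[...])
}

int main()
{
    // Upscale a 4-wide row to 8 columns: output x -> source x
    for(int x = 0; x < 8; ++x)
    {
        std::printf("%d -> %d\n", x, nearest_source_coord(x, 4.f / 8.f, 4, 0));
    }
    return 0;
}
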
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index c9fabc5..6967736 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -62,7 +62,7 @@
#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
void main(void)
@@ -77,19 +77,23 @@
uint width3 = width >> 3;
for(int i = 0; i < int(width3); i++)
{
- vec4 data[2];
- data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
- max_val = MAX_OP(data[0], max_val);
- max_val = MAX_OP(data[1], max_val);
+ vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ max_val = MAX_OP(data[0], max_val);
+ max_val = MAX_OP(data[1], max_val);
}
#ifdef NON_MULTIPLE_OF_8
// Handle non multiple of 8
- for(int i = int(width3 << 3); i < int(width); i++)
+ vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
+ int idx = 0;
+ if(width >> 2 != width3 << 1)
{
- float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
- max_val.x = MAX_OP(data, max_val.x);
+ max_val = MAX_OP(data[0], max_val);
+ idx = 1;
+ }
+ for(int i = 0; i < int(width) % 4; i++)
+ {
+ max_val.x = MAX_OP(data[idx][i], max_val.x);
}
#endif /* NON_MULTIPLE_OF_8 */
@@ -102,7 +106,7 @@
}
#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
void main(void)
@@ -117,24 +121,23 @@
uint width3 = width >> 3;
for(int i = 0; i < int(width3); i++)
{
- vec4 data[2];
- data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- max_val = MAX_OP(data[0], max_val);
- max_val = MAX_OP(data[1], max_val);
+ vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ max_val = MAX_OP(data[0], max_val);
+ max_val = MAX_OP(data[1], max_val);
}
#ifdef NON_MULTIPLE_OF_8
// Handle non multiple of 8
- uint width1 = width >> 1 << 1;
- for(int i = int(width3 << 3); i < int(width1); i = i + 2)
+ vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
+ int idx = 0;
+ if(width >> 2 != width3 << 1)
{
- vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
- max_val.xy = MAX_OP(data, max_val.xy);
+ max_val = MAX_OP(data[0], max_val);
+ idx = 1;
}
- if(width != width1)
+ for(int i = 0; i < int(width) % 4; i++)
{
- vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0));
- max_val.x = MAX_OP(data.x, max_val.x);
+ max_val.x = MAX_OP(data[idx][i], max_val.x);
}
#endif /* NON_MULTIPLE_OF_8 */
@@ -175,9 +178,9 @@
};
#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, dstBuffer, vec4[2], dst_ptr, dst_shift, 5, writeonly);
TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
void main(void)
@@ -198,28 +201,34 @@
for(int i = 0; i < int(width3); i++)
{
vec4 data[2];
- data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
+ data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
data[0] = SUB_OP(data[0], max_val);
data[1] = SUB_OP(data[1], max_val);
data[0] = EXP_OP(data[0]);
data[1] = EXP_OP(data[1]);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data[0]);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, (i << 3) + 4, 0), data[1]);
+ STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
sum1D = ADD_OP(sum1D, data[0]);
sum1D = ADD_OP(sum1D, data[1]);
}
#ifdef NON_MULTIPLE_OF_8
// Handle non multiple of 8
- for(int i = int(width3 << 3); i < int(width); i++)
+ vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
+ int idx = 0;
+ if(width >> 2 != width3 << 1)
{
- float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
- data = SUB_OP(data, max_val.x);
- data = EXP_OP(data);
- STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
- sum1D.x = ADD_OP(sum1D.x, data);
+ data[0] = SUB_OP(data[0], max_val);
+ data[0] = EXP_OP(data[0]);
+ sum1D = ADD_OP(sum1D, data[0]);
+ idx = 1;
}
+ for(int i = 0; i < int(width) % 4; i++)
+ {
+ data[idx][i] = SUB_OP(data[idx][i], max_val.x);
+ data[idx][i] = EXP_OP(data[idx][i]);
+ sum1D.x = ADD_OP(sum1D.x, data[idx][i]);
+ }
+ STORE(dst_ptr, IMAGE_OFFSET(dst_iter, width3 << 3, 0), data);
#endif /* NON_MULTIPLE_OF_8 */
// Perform min/max reduction
@@ -231,9 +240,9 @@
}
#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
void main(void)
@@ -254,36 +263,34 @@
uint width3 = width >> 3;
for(int i = 0; i < int(width3); i++)
{
- vec4 data[2];
- data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- data[0] = SUB_OP(data[0], max_val);
- data[1] = SUB_OP(data[1], max_val);
- data[0] = EXP_OP(data[0]);
- data[1] = EXP_OP(data[1]);
- VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
+ vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ data[0] = SUB_OP(data[0], max_val);
+ data[1] = SUB_OP(data[1], max_val);
+ data[0] = EXP_OP(data[0]);
+ data[1] = EXP_OP(data[1]);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
sum1D = ADD_OP(sum1D, data[0]);
sum1D = ADD_OP(sum1D, data[1]);
}
#ifdef NON_MULTIPLE_OF_8
// Handle non multiple of 8
- uint width1 = width >> 1 << 1;
- for(int i = int(width3 << 3); i < int(width1); i = i + 2)
+ vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
+ int idx = 0;
+ if(width >> 2 != width3 << 1)
{
- vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
- data = SUB_OP(data, max_val.xy);
- data = EXP_OP(data);
- STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
- sum1D.xy = ADD_OP(sum1D.xy, data);
+ data[0] = SUB_OP(data[0], max_val);
+ data[0] = EXP_OP(data[0]);
+ sum1D = ADD_OP(sum1D, data[0]);
+ idx = 1;
}
- if(width != width1)
+ for(int i = 0; i < int(width) % 4; i++)
{
- float data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0)).x;
- data = SUB_OP(data, max_val.x);
- data = EXP_OP(data);
- STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, width1, 0), vec2(data, 0.0));
- sum1D.x = ADD_OP(sum1D.x, data);
+ data[idx][i] = SUB_OP(data[idx][i], max_val.x);
+ data[idx][i] = EXP_OP(data[idx][i]);
+ sum1D.x = ADD_OP(sum1D.x, data[idx][i]);
}
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, width3 << 3, 0), data);
#endif /* NON_MULTIPLE_OF_8 */
// Perform min/max reduction
sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
@@ -315,9 +322,9 @@
Tensor3DAttributes dst_attrs;
};
#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, dstBuffer, vec4[2], dst_ptr, dst_shift, 5, writeonly);
void main(void)
{
ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
@@ -327,16 +334,15 @@
// Load max value of 1D logits vector (row)
vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
- vec4 data[2];
- data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), DIV_OP(data[0], sum_val));
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 4, 0), DIV_OP(data[1], sum_val));
+ vec4 data[2] = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ data[0] = DIV_OP(data[0], sum_val);
+ data[1] = DIV_OP(data[1], sum_val);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, data);
}
#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
void main(void)
{
ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
@@ -346,12 +352,10 @@
// Load max value of 1D logits vector (row)
vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
- vec4 data[2];
- data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- vec4 ret[2];
- ret[0] = DIV_OP(data[0], sum_val);
- ret[1] = DIV_OP(data[1], sum_val);
- VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), ret);
+ vec4 data[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ data[0] = DIV_OP(data[0], sum_val);
+ data[1] = DIV_OP(data[1], sum_val);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
}
#else // DATA_TYPE_FP32
#error Data type not supported
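
Note: the three kernels patched above implement the usual numerically stable softmax in stages (row max, shifted exponentials plus row sum, then normalisation), vectorised over groups of 8 elements. A compact CPU reference of the same computation, for orientation only; it is not the library's implementation:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Reference softmax over one row, mirroring the max / exp-sum / normalise stages.
static std::vector<float> softmax_row(const std::vector<float> &row)
{
    const float max_val = *std::max_element(row.begin(), row.end());

    std::vector<float> out(row.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = std::exp(row[i] - max_val); // shift by the row max for numerical stability
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum;
    }
    return out;
}

int main()
{
    for(float v : softmax_row({ 1.f, 2.f, 3.f, 4.f }))
    {
        std::printf("%f ", v);
    }
    std::printf("\n");
    return 0;
}
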
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
old mode 100644
new mode 100755
index f8ad303..89bf9fb
--- a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,12 @@
* SOFTWARE.
*/
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+
+#include "helpers_cs.h"
+
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
#define SWAP_ROW_func(u0, l0) \
{ \
@@ -59,144 +64,90 @@
* @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
* @note Optimization name must be passed using "#define OPTIMIZATION_NAME" for F16. e.g. "#define TRANSPOSE_8X8"
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32/F16
+ * @param[in] src_attrs The attributes of the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data type: same as src_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
*/
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
{
- IMAGE_PARAM_DECLARATION(src);
- IMAGE_PARAM_DECLARATION(dst);
+ ImageAttributes src_attrs;
+ ImageAttributes dst_attrs;
};
#ifdef DATA_TYPE_FP32
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-
-#define LOAD16(r, name, offset) \
- { \
- r.x = LOAD4(name, offset); \
- r.y = LOAD4(name, offset + uint(1)); \
- r.z = LOAD4(name, offset + uint(2)); \
- r.w = LOAD4(name, offset + uint(3)); \
- }
-
-#define STORE16(name, offset, r) \
- { \
- STORE4(name, offset, r.x); \
- STORE4(name, offset + uint(1), r.y); \
- STORE4(name, offset + uint(2), r.z); \
- STORE4(name, offset + uint(3), r.w); \
- }
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
void main(void)
{
// compute source address
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
// load the NxN block at (x, y)
- vec4 u0;
- vec4 u1;
- vec4 u2;
- vec4 u3;
- LOAD16(u0, src, offset(src, 0, 0));
- LOAD16(u1, src, offset(src, 0, 1));
- LOAD16(u2, src, offset(src, 0, 2));
- LOAD16(u3, src, offset(src, 0, 3));
+ vec4 u0 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ vec4 u1 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ vec4 u2 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ vec4 u3 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
// transpose the block
TRANSPOSE_4x4_func(u0, u1, u2, u3);
// store the block at (y, x)
- uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst.stride_y) + (dst.offset_first_element_in_bytes);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst_attrs.stride_y));
- STORE16(dst, uint((dst_offset_in_bytes + uint(0) * dst.stride_y) >> 2), u0);
- STORE16(dst, uint((dst_offset_in_bytes + uint(1) * dst.stride_y) >> 2), u1);
- STORE16(dst, uint((dst_offset_in_bytes + uint(2) * dst.stride_y) >> 2), u2);
- STORE16(dst, uint((dst_offset_in_bytes + uint(3) * dst.stride_y) >> 2), u3);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), u0);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), u1);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), u2);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), u3);
}
-#elif defined(DATA_TYPE_FP16) /* DATA_TYPE_FP32 */
-precision mediump float;
+#elif defined(DATA_TYPE_FP16) /* DATA_TYPE_FP16 */
#if defined(TRANSPOSE_4X4)
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
void main(void)
{
// compute source address
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
// load the NxN block at (x, y)
- vec4 u0;
- vec4 u1;
- vec4 u2;
- vec4 u3;
- uvec2 packed_s[4];
-
- GC_LOAD1_2D_OFFSET(packed_s[0], src, 0, 0);
- GC_LOAD1_2D_OFFSET(packed_s[1], src, 0, 1);
- GC_LOAD1_2D_OFFSET(packed_s[2], src, 0, 2);
- GC_LOAD1_2D_OFFSET(packed_s[3], src, 0, 3);
-
- u0 = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- u1 = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
- u2 = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
- u3 = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+ vec4 u0 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ vec4 u1 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+ vec4 u2 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+ vec4 u3 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
// transpose the block
TRANSPOSE_4x4_func(u0, u1, u2, u3);
// store the block at (y, x)
- uint dst_offset_in_bytes = uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
- dst.current_offset = dst_offset_in_bytes;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
- packed_s[0] = uvec2(packHalf2x16(u0.xy), packHalf2x16(u0.zw));
- packed_s[1] = uvec2(packHalf2x16(u1.xy), packHalf2x16(u1.zw));
- packed_s[2] = uvec2(packHalf2x16(u2.xy), packHalf2x16(u2.zw));
- packed_s[3] = uvec2(packHalf2x16(u3.xy), packHalf2x16(u3.zw));
-
- GC_STORE1_2D_OFFSET(packed_s[0], dst, 0, 0);
- GC_STORE1_2D_OFFSET(packed_s[1], dst, 0, 1);
- GC_STORE1_2D_OFFSET(packed_s[2], dst, 0, 2);
- GC_STORE1_2D_OFFSET(packed_s[3], dst, 0, 3);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), u0);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), u1);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), u2);
+ STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), u3);
}
-#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
-
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_8X8 */
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
void main(void)
{
// compute source address
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
- vec4 u[8][2];
- uvec4 packed_s[8];
+ vec4 u[8][2];
for(int i = 0; i < 8; i++)
{
- GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
- u[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
- u[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+ u[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
}
// transpose the block
@@ -207,41 +158,36 @@
SWAP_4x4_func(u[0][1], u[1][1], u[2][1], u[3][1], u[4][0], u[5][0], u[6][0], u[7][0]);
// store the block at (y, x)
- uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
- dst.current_offset = dst_offset_in_bytes;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
for(int i = 0; i < 8; i++)
{
- packed_s[i] = uvec4(packHalf2x16(u[i][0].xy), packHalf2x16(u[i][0].zw), packHalf2x16(u[i][1].xy), packHalf2x16(u[i][1].zw));
- GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u[i]);
}
}
-#elif defined(TRANSPOSE_8X8_SQUARE) /* TRANSPOSE_4X4 */
-
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
+#elif defined(TRANSPOSE_8X8_SQUARE) /* TRANSPOSE_8X8_SQUARE */
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
void main(void)
{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
if(gl_GlobalInvocationID.x <= gl_GlobalInvocationID.y)
{
- uint blk1_offset_in_bytes = src.current_offset;
- uint blk2_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes);
+ uint blk1_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(src_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
+ uint blk2_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(dst_iter);
// load block1
- vec4 u1[8][2];
- uvec4 packed_s[8];
+ vec4 u1[8][2];
- src.current_offset = blk1_offset_in_bytes;
+ SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, blk1_offset_in_bytes);
for(int i = 0; i < 8; i++)
{
- GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
- u1[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
- u1[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+ u1[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
}
// transpose block1
@@ -252,22 +198,19 @@
SWAP_4x4_func(u1[0][1], u1[1][1], u1[2][1], u1[3][1], u1[4][0], u1[5][0], u1[6][0], u1[7][0]);
// write to block2
- dst.current_offset = blk2_offset_in_bytes;
+ SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(dst_iter, blk2_offset_in_bytes);
for(int i = 0; i < 8; i++)
{
- packed_s[i] = uvec4(packHalf2x16(u1[i][0].xy), packHalf2x16(u1[i][0].zw), packHalf2x16(u1[i][1].xy), packHalf2x16(u1[i][1].zw));
- GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u1[i]);
}
// load block2
vec4 u2[8][2];
- src.current_offset = blk2_offset_in_bytes;
+ SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, blk2_offset_in_bytes);
for(int i = 0; i < 8; i++)
{
- GC_LOAD1_2D_OFFSET(packed_s[i], src, 0, i);
- u2[i][0] = vec4(unpackHalf2x16(packed_s[i].x), unpackHalf2x16(packed_s[i].y));
- u2[i][1] = vec4(unpackHalf2x16(packed_s[i].z), unpackHalf2x16(packed_s[i].w));
+ u2[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
}
// transpose block2
@@ -278,11 +221,10 @@
SWAP_4x4_func(u2[0][1], u2[1][1], u2[2][1], u2[3][1], u2[4][0], u2[5][0], u2[6][0], u2[7][0]);
// write to block1
- dst.current_offset = blk1_offset_in_bytes;
+ SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(dst_iter, blk1_offset_in_bytes);
for(int i = 0; i < 8; i++)
{
- packed_s[i] = uvec4(packHalf2x16(u2[i][0].xy), packHalf2x16(u2[i][0].zw), packHalf2x16(u2[i][1].xy), packHalf2x16(u2[i][1].zw));
- GC_STORE1_2D_OFFSET(packed_s[i], dst, 0, i);
+ STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u2[i]);
}
}
}
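
Note: the transpose shaders above operate on 4x4 (or 8x8) register blocks, and for FP16 they pack/unpack pairs of halves into 32-bit words around the block swap. A plain 4x4 block transpose in C++, only to illustrate the block-wise scheme the kernels use; names are illustrative:

#include <cstdio>
#include <utility>

// Transpose a 4x4 block in place by swapping elements across the diagonal,
// the same operation TRANSPOSE_4x4_func performs on four vec4 rows.
static void transpose_4x4(float block[4][4])
{
    for(int row = 0; row < 4; ++row)
    {
        for(int col = row + 1; col < 4; ++col)
        {
            std::swap(block[row][col], block[col][row]);
        }
    }
}

int main()
{
    float block[4][4];
    for(int row = 0; row < 4; ++row)
    {
        for(int col = 0; col < 4; ++col)
        {
            block[row][col] = static_cast<float>(4 * row + col);
        }
    }
    transpose_4x4(block);
    for(int row = 0; row < 4; ++row)
    {
        std::printf("%g %g %g %g\n", block[row][0], block[row][1], block[row][2], block[row][3]);
    }
    return 0;
}
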
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000..caec324
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
+
+ // Validate in case of configured output
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+
+ Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
+ input2->valid_region());
+
+ output_access.set_valid_region(win, valid_region);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+GCArithmeticAdditionKernel::GCArithmeticAdditionKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void GCArithmeticAdditionKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_UNUSED(policy);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+ // Auto initialize output if not initialized
+ {
+ set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+ set_format_if_unknown(*output->info(), Format::F16);
+ }
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+ // Create kernel
+ _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IGCKernel::configure(win_config.second);
+}
+
+Status GCArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
+
+ return Status{};
+}
+
+void GCArithmeticAdditionKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
+
+ _kernel.use();
+
+ Window slice = window.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ unsigned int binding = 1; // SSBO binding starts from 1.
+ add_2D_tensor_argument(idx, _input1, binding++, slice);
+ add_2D_tensor_argument(idx, _input2, binding++, slice);
+ add_2D_tensor_argument(idx, _output, binding++, slice);
+
+ _kernel.update_shader_params();
+
+ enqueue(*this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
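
Note: validate_and_configure_window above processes 8 elements per iteration, so the horizontal access windows effectively require the X dimension to be padded up to a multiple of 8; when the existing padding is insufficient, the window is adjusted and padding requested. A tiny sketch of that rounding; ceil_to_multiple mirrors the arm_compute utility of the same name, the rest is illustrative:

#include <cstdio>
#include <initializer_list>

// Round value up to the next multiple of divisor (same idea as arm_compute's ceil_to_multiple).
static unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

int main()
{
    const unsigned int num_elems_processed_per_iteration = 8;
    for(unsigned int width : { 13u, 16u, 33u })
    {
        std::printf("width %u -> padded width %u\n", width,
                    ceil_to_multiple(width, num_elems_processed_per_iteration));
    }
    return 0;
}
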
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index 492f708..af1e34e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -43,31 +44,50 @@
void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
std::pair<unsigned int, unsigned int> convolved_dims)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ TensorShape output_shape = input->info()->tensor_shape();
+ output_shape.set(0, convolved_dims.first);
+ output_shape.set(1, convolved_dims.second);
+ output_shape.set(2, input->info()->tensor_shape()[0]);
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
_input = input;
_output = output;
_convolved_dims = convolved_dims;
+ unsigned int num_elems_processed_per_iteration = 1;
+
// Create kernel
- std::set<std::string> build_opts;
- constexpr unsigned int num_elems_processed_per_iteration = 8;
+ std::set<std::string> build_opts;
+ build_opts.emplace("#define WIDTH_OUTPUT " + support::cpp11::to_string(_convolved_dims.first));
+ std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+ build_opts.emplace(("#define " + dt_name));
build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration));
build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.insert("#define COL2IM");
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(num_elems_processed_per_iteration));
+
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.set_argument(idx++, _convolved_dims.first);
-
// Configure window
- Window win = calculate_max_window(*input->info(), Steps());
+ unsigned int nums = 2;
+ Window win = calculate_max_window(*output->info(), Steps(nums));
- // The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ AccessWindowHorizontal output_access(output->info(), 0, 2);
+ const int input_padding = ceil_to_multiple(input->info()->dimension(0), 2) - input->info()->dimension(0);
+
+ AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + input_padding, input->info()->dimension(1) + 1);
+
+ update_window_and_padding(win, input_access,
+ output_access);
+
+ output_access.set_valid_region(win, output->info()->valid_region());
IGCKernel::configure(win);
}
@@ -77,20 +97,25 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
- Window slice_in = window.first_slice_window_2D();
- Window slice_out = window.first_slice_window_3D();
-
_kernel.use();
+ Window collapsed_window = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+ Window slice = collapsed_window.first_slice_window_3D();
+
+ // Set static kernel arguments
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ //_kernel.set_argument(idx++, _output->info()->strides_in_bytes()[3]);
+ _kernel.set_argument(idx++, uint(_output->info()->dimension(2)));
+ _kernel.set_argument(idx++, _input->info()->strides_in_bytes()[2]);
+
do
{
// Set inputs
- unsigned int idx = 0;
- unsigned int binding = 1;
- add_2D_tensor_argument(idx, _input, binding++, slice_in);
- add_3D_tensor_argument(idx, _output, binding++, slice_out);
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, 1, slice);
+ add_3D_tensor_argument(idx, _output, 2, slice);
_kernel.update_shader_params();
- enqueue(*this, slice_in);
+ enqueue(*this, slice);
}
- while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out));
+ while(collapsed_window.slide_window_slice_3D(slice));
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index a611178..7b1848c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -106,7 +106,7 @@
AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
IGCKernel::configure(win);
}
@@ -122,18 +122,9 @@
do
{
- if(_input->info()->data_type() == DataType::F32)
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice);
- add_3D_tensor_argument(idx, _output, 2, slice);
- }
- else if(_input->info()->data_type() == DataType::F16)
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
- }
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, 1, slice);
+ add_3D_tensor_argument(idx, _output, 2, slice);
_kernel.update_shader_params();
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
new file mode 100644
index 0000000..28b5bd2
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+
+using namespace arm_compute;
+
+namespace
+{
+/** Calculates the expected output shape
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] weights_shape Weights tensor shape
+ * @param[in] conv_info     Convolution pad and stride information
+ *
+ * @return Expected output shape
+ */
+TensorShape get_output_shape(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info)
+{
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
+
+ std::tie(output_width, output_height) = scaled_dimensions(input_shape.x(), input_shape.y(), weights_shape.x(), weights_shape.y(), conv_info);
+
+ TensorShape output_shape = input_shape;
+ output_shape.set(0, output_width);
+ output_shape.set(1, output_height);
+
+ return output_shape;
+}
+} // namespace
+
+GCDepthwiseConvolutionLayer3x3Kernel::GCDepthwiseConvolutionLayer3x3Kernel()
+ : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0), _lws(gles::NDRange(1U, 1U, 1U))
+{
+}
+
+BorderSize GCDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+ return _border_size;
+}
+
+void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ // Get convolved dimensions
+ TensorShape output_shape = get_output_shape(input->info()->tensor_shape(), weights->info()->tensor_shape(), conv_info);
+
+ // Output auto-initialization if not yet initialized
+ auto_init_if_empty(*output->info(),
+ output_shape,
+ 1,
+ input->info()->data_type(),
+ input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _biases = biases;
+ _conv_stride_x = conv_info.stride().first;
+ _conv_stride_y = conv_info.stride().second;
+ _conv_pad_left = conv_info.pad_left();
+ _conv_pad_top = conv_info.pad_top();
+ _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
+
+ // Set build options
+ ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
+ std::set<std::string> options;
+
+ options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+ options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+ options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
+ options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
+ options.emplace("#define STRIDE_Y " + support::cpp11::to_string(_conv_stride_y));
+
+ std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+ options.emplace(("#define " + dt_name));
+
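+ // Default tile: read 8x1 input elements and write a 4x1x1 output tile per invocation; refined below per data type and stride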
+ unsigned int num_elems_read_per_iteration_x = 8;
+ unsigned int num_elems_read_per_iteration_y = 1;
+ unsigned int num_elems_written_per_iteration_x = 4;
+ unsigned int num_elems_written_per_iteration_y = 1;
+ unsigned int num_elems_written_per_iteration_z = 1;
+
+ if((_conv_stride_x == 1) && (_conv_stride_y == 1))
+ {
+ switch(input->info()->data_type())
+ {
+#define PROCESS_4X_3Y_1Z
+
+ case DataType::F16:
+#if defined(PROCESS_4X_3Y_1Z)
+ options.emplace("#define PROCESS_4X_3Y_1Z");
+ num_elems_read_per_iteration_y = 5;
+ num_elems_written_per_iteration_y = 3;
+#endif /* PROCESS_4X_3Y_1Z */
+#undef PROCESS_4X_3Y_1Z
+ break;
+
+ default:
+ ARM_COMPUTE_ERROR("Current data type is not supported");
+ break;
+ }
+ }
+ else
+ {
+ switch(input->info()->data_type())
+ {
+ case DataType::F16:
+ options.emplace("#define PROCESS_4X_1Y_1Z");
+ break;
+
+ default:
+ ARM_COMPUTE_ERROR("Current data type is not supported");
+ break;
+ }
+ }
+
+ if(_biases != nullptr)
+ {
+ options.emplace("#define BIAS");
+ }
+
+ // Create kernel
+ std::string kernel_name = "depthwise_convolution_3x3";
+ _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, options));
+
+ // Calculate output right and bottom border
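+ // Each work-group writes num_elems_written_per_iteration * lws elements per axis; round the output extent up to that granularity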
+ const int output_width = output->info()->dimension(0);
+ const int output_height = output->info()->dimension(1);
+ const int output_padding_right = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
+ const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
+
+ // Calculate input right and bottom border
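+ // Reads start at (-pad_left, -pad_top); the read extent is rounded up to the per-work-group read width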
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
+ const int padding_right = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + 2), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_left - input_width;
+ const int padding_bottom = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + 2), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_top - input_height;
+
+ BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
+
+ Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
+
+ AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + padding_right, input_height + padding_bottom);
+ AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
+ AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1);
+
+ switch(weights->info()->data_type())
+ {
+ case DataType::F16:
+ weights_access = AccessWindowStatic(weights->info(), 0, 0, 4, 3);
+ if(_biases != nullptr)
+ {
+ bias_access = AccessWindowStatic(_biases->info(), 0, 0, _biases->info()->dimension(0) + 1, 1);
+ }
+ break;
+
+ default:
+ ARM_COMPUTE_ERROR("Current data type is not supported");
+ break;
+ }
+
+ AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+
+ if(_biases != nullptr)
+ {
+ update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
+ }
+ else
+ {
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ }
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ IGCKernel::configure(win);
+}
+
+void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.use();
+
+ // Create input window and adjust
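+ // Shift the origin by the pad and scale the step by the stride so each output slice maps onto its input region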
+ Window win_in = window;
+ win_in.adjust(Window::DimX, -_conv_pad_left, true);
+ win_in.adjust(Window::DimY, -_conv_pad_top, true);
+ win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
+ win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
+
+ Window slice_in = win_in.first_slice_window_3D();
+ Window slice_out = window.first_slice_window_3D();
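+ // Use a zero X/Y step so the same 3x3 weights window is bound for every slice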
+ Window slice_weights = window.first_slice_window_3D();
+ slice_weights.set_dimension_step(Window::DimX, 0);
+ slice_weights.set_dimension_step(Window::DimY, 0);
+
+ // Set biases
+ if(_biases != nullptr)
+ {
+ unsigned int idx = 3 * num_arguments_per_3D_tensor();
+ Window slice_biases;
+ slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
+ add_1D_tensor_argument(idx, _biases, 4, slice_biases);
+ }
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, 1, slice_in);
+ add_3D_tensor_argument(idx, _output, 2, slice_out);
+ add_3D_tensor_argument(idx, _weights, 3, slice_weights);
+
+ _kernel.update_shader_params();
+ enqueue(*this, slice_out, _lws);
+ }
+ while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index ca673ea..abfe5cc 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,6 +62,8 @@
if(bias != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ // FIXME: Bug in the framework; worked around in the tests for now.
+ //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
}
@@ -117,43 +119,43 @@
{
switch(input->info()->data_type())
{
-#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-
case DataType::F16:
-#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+#define PROCESS_4X_3Y_1Z
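+ // Defining PROCESS_4X_3Y_1Z selects the matching branch of the #if chain below; the other tilings remain available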
+
+#if defined(PROCESS_8X_3Y_1Z)
+ options.emplace("#define PROCESS_8X_3Y_1Z");
num_elems_read_per_iteration_x = 16;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 8;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_1Z)
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+#elif defined(PROCESS_4X_4Y_1Z)
+ options.emplace("#define PROCESS_4X_4Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 6;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_2Z)
+ options.emplace("#define PROCESS_4X_3Y_2Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
num_elems_written_per_iteration_z = 2;
-#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
-#undef PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_8X_3Y_1Z
+#undef PROCESS_4X_3Y_1Z
+#undef PROCESS_4X_4Y_1Z
+#undef PROCESS_4X_3Y_2Z
break;
case DataType::F32:
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
@@ -170,32 +172,32 @@
switch(input->info()->data_type())
{
case DataType::F16:
- options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
break;
case DataType::F32:
-#define PROCESS_4_ELEMENT
+#define PROCESS_4X_1Y_1Z
-#if defined(PROCESS_1_ELEMENT)
- options.emplace("#define PROCESS_1_ELEMENT");
+#if defined(PROCESS_1X_1Y_1Z)
+ options.emplace("#define PROCESS_1X_1Y_1Z");
num_elems_read_per_iteration_x = 3;
num_elems_written_per_iteration_x = 1;
-#elif defined(PROCESS_4_ELEMENT)
- options.emplace("#define PROCESS_4_ELEMENT");
+#elif defined(PROCESS_4X_1Y_1Z)
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_8_ELEMENT)
- options.emplace("#define PROCESS_8_ELEMENT");
+#elif defined(PROCESS_8X_1Y_1Z)
+ options.emplace("#define PROCESS_8X_1Y_1Z");
num_elems_read_per_iteration_x = 12;
num_elems_written_per_iteration_x = 8;
-#else /* PROCESS_1_ELEMENT */
+#else /* PROCESS_nX_nY_nZ */
#error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_1_ELEMENT */
-#undef PROCESS_1_ELEMENT
-#undef PROCESS_4_ELEMENT
-#undef PROCESS_8_ELEMENT
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_1X_1Y_1Z
+#undef PROCESS_4X_1Y_1Z
+#undef PROCESS_8X_1Y_1Z
break;
default:
@@ -392,69 +394,21 @@
Window slice_in = win_in.first_slice_window_3D();
unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
- add_3D_tensor_argument(idx1, _weights, BufferParam(3, 2), slice);
+ add_3D_tensor_argument(idx1, _weights, 3, slice);
if(_bias != nullptr)
{
Window slice_bias;
slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _bias, BufferParam(4, 2), slice_bias);
+ add_1D_tensor_argument(idx1, _bias, 4, slice_bias);
}
do
{
unsigned int idx = 0;
- switch(_input->info()->data_type())
- {
- case DataType::F16:
- switch(kernel_size)
- {
- case 1:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
- break;
-
- case 3:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
- break;
-
- case 5:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
- break;
- }
- break;
-
- case DataType::F32:
- switch(kernel_size)
- {
- case 1:
- case 5:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice_in);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 2), slice);
- break;
-
- case 3:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in);
- add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size);
- break;
- }
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
+ add_3D_tensor_argument(idx, _input, 1, slice_in);
+ add_3D_tensor_argument(idx, _output, 2, slice);
_kernel.update_shader_params();
enqueue(*this, slice, _lws);
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
index e87c902..8886b84 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,9 +96,9 @@
{
unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx, _mask, BufferParam(2, 2), slice);
- add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
+ add_3D_tensor_argument(idx, _input, 1, slice);
+ add_3D_tensor_argument(idx, _mask, 2, slice);
+ add_3D_tensor_argument(idx, _output, 3, slice);
_kernel.update_shader_params();
enqueue(*this, slice);
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index 4bc6731..dc86bfb 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,6 +35,7 @@
#include "arm_compute/core/Window.h"
using namespace arm_compute;
+using namespace arm_compute::gles_compute;
GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
: _input(nullptr), _output(nullptr)
@@ -43,7 +44,7 @@
void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape = input->info()->tensor_shape();
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index 944585d..6d856e9 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -117,26 +117,9 @@
{
// Set arguments
unsigned int idx = 0;
- if(_accum->info()->data_type() == DataType::F32)
- {
- add_2D_tensor_argument(idx, _accum, 1, accum_slice);
- add_1D_tensor_argument(idx, _biases, 2, biases_slice);
- }
- else if(_accum->info()->data_type() == DataType::F16)
- {
-#if defined(ACCUM_PROCESS_4X)
- BufferParam param = { 1, 3 };
- add_2D_tensor_argument(idx, _accum, param, accum_slice);
- param.binding_point = 2;
- add_1D_tensor_argument(idx, _biases, param, biases_slice);
-#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
- BufferParam param = { 1, 4 };
- add_2D_tensor_argument(idx, _accum, param, accum_slice);
- param.binding_point = 2;
- add_1D_tensor_argument(idx, _biases, param, biases_slice);
-#endif /* ACCUM_PROCESS_4X */
- }
+ add_2D_tensor_argument(idx, _accum, 1, accum_slice);
+ add_1D_tensor_argument(idx, _biases, 2, biases_slice);
_kernel.update_shader_params();
enqueue(*this, accum_slice, _lws);
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index cf5d378..43846dc 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -34,6 +34,7 @@
#include "arm_compute/core/Window.h"
using namespace arm_compute;
+using namespace arm_compute::gles_compute;
GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
: _input(nullptr), _output(nullptr)
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index 8179525..a5f09e8 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,6 +40,7 @@
#include <string>
using namespace arm_compute;
+using namespace arm_compute::gles_compute;
GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
: _input0(nullptr), _input1(nullptr), _output(nullptr)
@@ -105,7 +106,7 @@
update_window_and_padding(win, input0_access, input1_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
}
else
{
@@ -200,35 +201,10 @@
}
unsigned int idx = 0;
- switch(_input0->info()->data_type())
- {
- case DataType::F16:
-#if defined(MM_PROCESS_4X)
- add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
- add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
- add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
-#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
- add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
- add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
- add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
-#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
- add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
- add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
- add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
-#endif /* MM_PROCESS_4X */
- break;
- case DataType::F32:
- add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
- add_2D_tensor_argument(idx, _input1, BufferParam(2, 2), slice_b);
- add_2D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
+ add_2D_tensor_argument(idx, _input0, 1, slice);
+ add_2D_tensor_argument(idx, _input1, 2, slice_b);
+ add_2D_tensor_argument(idx, _output, 3, slice);
_kernel.update_shader_params();
enqueue(*this, slice);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
index c361b60..5d9f9c2 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@
void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape{ input->info()->tensor_shape() };
@@ -90,7 +90,7 @@
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape()));
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index e849891..4ab6f3e 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,6 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
#include "arm_compute/core/AccessWindowStatic.h"
@@ -30,6 +31,7 @@
#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
@@ -39,20 +41,40 @@
using namespace arm_compute;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+} // namespace
+
GCIm2ColKernel::GCIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
+ : _input(nullptr), _output(nullptr), _convolved_dims(), _kernel_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
{
}
-void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_UNUSED(kernel_dims);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
_input = input;
_output = output;
+ // Create kernel
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
@@ -65,48 +87,52 @@
build_opts.emplace("#define HAS_BIAS");
}
- int pad_x = 0;
- int pad_y = 0;
int stride_x = 0;
int stride_y = 0;
- std::tie(pad_x, pad_y) = conv_info.pad();
+
std::tie(stride_x, stride_y) = conv_info.stride();
+ _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height);
const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
&& (std::equal(input->info()->tensor_shape().cbegin() + 3,
input->info()->tensor_shape().cend(),
output->info()->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+ && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+ std::string kernel_name = "im2col_generic";
if(!run_img2col_reduced)
{
- // this path is currently not used and not validated
- build_opts.insert("#define IM2COL_GENERIC");
- _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
- kernel_dims.first, kernel_dims.second,
- conv_info);
- _num_elems_processed_per_iteration = output->info()->dimension(0);
+ if(input->info()->data_type() == DataType::F16 && _kernel_dims == std::pair<unsigned int, unsigned int>(1, 1))
+ {
+ build_opts.emplace("#define KERNEL_1x1");
+ }
- build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.first));
- build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.second));
+ build_opts.emplace("#define IM2COL_GENERIC");
+ _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+ kernel_dims.width, kernel_dims.height,
+ conv_info);
+ _num_elems_processed_per_iteration = 2;
+
+ build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
+ build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.height));
build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2)));
build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first));
build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second));
build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first));
build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second));
- build_opts.emplace("#define PAD_X " + support::cpp11::to_string(conv_info.pad().first));
- build_opts.emplace("#define PAD_Y " + support::cpp11::to_string(conv_info.pad().second));
+ build_opts.emplace("#define PAD_LEFT " + support::cpp11::to_string(conv_info.pad_left()));
+ build_opts.emplace("#define PAD_TOP " + support::cpp11::to_string(conv_info.pad_top()));
+ build_opts.emplace("#define PAD_RIGHT " + support::cpp11::to_string(conv_info.pad_right()));
+ build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom()));
build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_generic", build_opts));
-
_run_func = &GCIm2ColKernel::run_generic;
}
else
{
- build_opts.insert("#define IM2COL_REDUCED");
+ build_opts.emplace("#define IM2COL_REDUCED");
+ kernel_name = "im2col_reduced";
if(input->info()->data_type() == DataType::F32)
{
@@ -117,42 +143,47 @@
int input_width = input->info()->dimension(0);
int input_height = input->info()->dimension(1);
- build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
+ build_opts.emplace("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
if(input_width % 8 == 0)
{
_num_elems_processed_per_iteration = 8;
- build_opts.insert("#define IM2COL_REDUCED_8X");
+ build_opts.emplace("#define IM2COL_REDUCED_8X");
}
else if(input_width % 4 == 0)
{
_num_elems_processed_per_iteration = 4;
- build_opts.insert("#define IM2COL_REDUCED_4X");
+ build_opts.emplace("#define IM2COL_REDUCED_4X");
}
else if(input_width % 2 == 0)
{
_num_elems_processed_per_iteration = 2;
- build_opts.insert("#define IM2COL_REDUCED_2X");
+ build_opts.emplace("#define IM2COL_REDUCED_2X");
}
else
{
_num_elems_processed_per_iteration = 2;
- build_opts.insert("#define IM2COL_REDUCED_GENERIC");
+ build_opts.emplace("#define IM2COL_REDUCED_GENERIC");
}
}
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
-
_run_func = &GCIm2ColKernel::run_reduced;
}
+ // Create kernel
+ _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
if(input->info()->data_type() == DataType::F16)
{
// Calculate input right and bottom border
- AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+ const int input_width = input->info()->dimension(0);
+ const int input_height = input->info()->dimension(1);
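+ // Round the padded input width up to the processing step so whole vectors can be read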
+ int input_total_width = input->info()->padding().left + input_width + input->info()->padding().right;
+ int input_padding_right = ceil_to_multiple(input_total_width, _num_elems_processed_per_iteration) - input_total_width;
+ input_total_width = input_width + input_padding_right + input->info()->padding().right;
+ AccessWindowStatic input_access(input->info(), 0, 0, input_total_width, input_height);
// Calculate output right and bottom border
const int output_width = output->info()->dimension(0);
@@ -174,6 +205,15 @@
IGCKernel::configure(win);
}
+Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_UNUSED(kernel_dims);
+ ARM_COMPUTE_UNUSED(conv_info);
+ ARM_COMPUTE_UNUSED(has_bias);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ return Status{};
+}
+
void GCIm2ColKernel::run(const Window &window)
{
ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
@@ -187,6 +227,7 @@
// Get initial windows
Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
+
// Change the Z dimension's step back to 1
window_collapsed.set_dimension_step(Window::DimZ, 1);
@@ -198,17 +239,18 @@
slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
- // Setup input slice
- // The first three dimensions of the input are increased by the inner loops
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
// Setup output slice
slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+ // We need the top/left pad to be included in the valid region
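+ // Re-initializing the TensorInfo with a zero first-element offset makes addressing start at the buffer origin, i.e. at the padding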
+ if(_input->info()->data_type() == DataType::F16)
+ {
+ (dynamic_cast<TensorInfo *>(_input->info()))->init(_input->info()->tensor_shape(), _input->info()->num_channels(), _input->info()->data_type(), _input->info()->strides_in_bytes(), 0,
+ _input->info()->total_size(), _input->info()->fixed_point_position());
+ }
+
_kernel.use();
do
@@ -216,8 +258,6 @@
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, 1, slice_in);
add_2D_tensor_argument(idx, _output, 2, slice_out);
-
- _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
_kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
_kernel.set_argument(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
_kernel.update_shader_params();
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
new file mode 100644
index 0000000..bc9c7eb
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+GCNormalizePlanarYUVLayerKernel::GCNormalizePlanarYUVLayerKernel()
+ : _input(nullptr), _output(nullptr), _mean(nullptr), _sd(nullptr)
+{
+}
+
+void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, sd);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, sd);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+ _input = input;
+ _output = output;
+ _mean = mean;
+ _sd = sd;
+
+ const unsigned int num_elems_processed_per_iteration = 4;
+
+ // Set build options
+ std::set<std::string> build_opts;
+ build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
+ build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
+ build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
+
+ // Create kernel
+ _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalize_planar_yuv_layer", build_opts));
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
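+ // Pad the 1D mean/sd accesses up to the 4-element processing step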
+ const int mean_padding = ceil_to_multiple(mean->info()->dimension(0), num_elems_processed_per_iteration) - mean->info()->dimension(0);
+ const int sd_padding = ceil_to_multiple(sd->info()->dimension(0), num_elems_processed_per_iteration) - sd->info()->dimension(0);
+ AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + mean_padding, mean->info()->dimension(1));
+ AccessWindowStatic sd_access(sd->info(), 0, 0, sd->info()->dimension(0) + sd_padding, sd->info()->dimension(1));
+
+ update_window_and_padding(win, input_access, output_access, mean_access, sd_access);
+ output_access.set_valid_region(win, input->info()->valid_region());
+
+ IGCKernel::configure(win);
+}
+
+void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+ _kernel.use();
+
+ Window slice = window.first_slice_window_3D();
+
+ Window slice_in;
+ //slice_in.use_tensor_dimensions(_mean->info()->tensor_shape());
+ slice_in = window.first_slice_window_1D();
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+
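+ // Mean and sd are bound once, outside the slice loop; they do not slide with the 3D slices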
+ unsigned int idx = 2 * num_arguments_per_3D_tensor();
+ add_1D_tensor_argument(idx, _mean, 3, slice_in);
+ add_1D_tensor_argument(idx, _sd, 4, slice_in);
+
+ do
+ {
+ idx = 0;
+ add_3D_tensor_argument(idx, _input, 1, slice);
+ add_3D_tensor_argument(idx, _output, 2, slice);
+
+ _kernel.update_shader_params();
+
+ enqueue(*this, slice);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index 0b6ba58..6451db7 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -40,6 +40,176 @@
using namespace arm_compute;
+namespace
+{
+// Internal window config info
+using GCPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
+
+void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
+{
+ TensorShape output_shape{ input->tensor_shape() };
+ output_shape.set(0, pooled_w);
+ output_shape.set(1, pooled_h);
+
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
+}
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
+ "Unsupported combination of parameters!");
+
+ const bool is_global_pooling = pool_info.is_global_pooling();
+ const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
+ "Global pooling is supported only with rectangular inputs!");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info().pad().first >= pool_size) || (pool_info.pad_stride_info().pad().second >= pool_size)),
+ "Invalid pool size and pool pad combination!");
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+ input->dimension(1),
+ pool_size,
+ pool_size,
+ pool_info.pad_stride_info());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
+ "Invalid output pooling dimensions!");
+ }
+
+ return Status{};
+}
+
+std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+ std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ // Update pool size in case of global pooling
+ pool_size = pool_info.is_global_pooling() ? input->dimension(0) : pool_size;
+
+ // Check output dimensions
+ std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
+ input->dimension(1),
+ pool_size,
+ pool_size,
+ pad_stride_info);
+
+ auto_init(input, output, pooled_w, pooled_h);
+
+ BorderSize border_size = BorderSize(pool_pad_y, pool_pad_x);
+ const DataType data_type = input->data_type();
+
+ const int input_width = input->dimension(0);
+ const int input_height = input->dimension(1);
+
+ unsigned int num_elems_processed_per_iteration = 1;
+
+ // Create kernel
+ if(pool_size == 3)
+ {
+ // Check if we have a 3x3 pool with stride_x less than or equal to 3. In these cases, run an optimized OpenGLES kernel where
+ // each thread computes 4 output elements
+ const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
+
+ int num_elems_read_per_iteration = pool_size;
+
+ if(input->data_type() == DataType::F32)
+ {
+ if(is_pool3x3_stride_le3)
+ {
+ // Change the number of elements processed and number of elements read per iteration for 3x3 pooling with stride less than or equal to 3
+ num_elems_processed_per_iteration = 4;
+ num_elems_read_per_iteration = pool_size * (pool_stride_x + 1);
+ }
+ }
+ else
+ {
+ if(is_pool3x3_stride_le3)
+ {
+ num_elems_processed_per_iteration = 4;
+ }
+ else
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
+
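+ // The last pooling window can read past the input edge; the overshoot (at least the pad) becomes the right/bottom border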
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ border_size.right = std::max(upper_bound_w, pool_pad_x);
+ border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+ }
+ else // Run general case
+ {
+ if(input->data_type() == DataType::F32)
+ {
+ num_elems_processed_per_iteration = 1;
+ }
+ else
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+ const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+ border_size.right = std::max(upper_bound_w, pool_pad_x);
+ border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+ }
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ if(input->data_type() == DataType::F32)
+ {
+ AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win, GCPoolingConfig(num_elems_processed_per_iteration, border_size));
+ }
+ else
+ {
+ // Calculate output right and bottom border
+ const int output_width = output->dimension(0);
+ const int output_height = output->dimension(1);
+ const int output_padding_right = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width;
+ const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
+ const int input_padding_right = ceil_to_multiple(input_width + 2 * border_size.right, num_elems_processed_per_iteration) - (input_width + 2 * border_size.right);
+ const int input_padding_bottom = ceil_to_multiple(input_height + 2 * border_size.bottom, 1) - (input_height + 2 * border_size.bottom);
+
+ // Configure kernel window
+ AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right + input_padding_right, input_height + border_size.bottom + input_padding_bottom);
+ AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
+ bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_tuple(err, win, GCPoolingConfig(num_elems_processed_per_iteration, border_size));
+ }
+}
+} // namespace
+
GCPoolingLayerKernel::GCPoolingLayerKernel()
: _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
{
@@ -52,54 +222,41 @@
void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info)
{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- const PoolingType pool_type = pool_info.pool_type();
- int pool_size = pool_info.pool_size();
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- const bool is_global_pooling = pool_info.is_global_pooling();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ unsigned int pooled_w = 0;
+ unsigned int pooled_h = 0;
+ const PoolingType pool_type = pool_info.pool_type();
+ int pool_size = pool_info.pool_size();
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
+ const bool exclude_padding = pool_info.exclude_padding();
std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
- ARM_COMPUTE_ERROR_ON(is_global_pooling && (input->info()->tensor_shape().x() != input->info()->tensor_shape().y()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Update pool size in case of global pooling
- pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
+ pool_size = pool_info.is_global_pooling() ? input->info()->dimension(0) : pool_size;
// Check output dimensions
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
input->info()->dimension(1),
pool_size,
pool_size,
- pool_info.pad_stride_info());
+ pad_stride_info);
- // Output auto initialization if not yet initialized
- {
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
+ auto_init(input->info(), output->info(), pooled_w, pooled_h);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
- }
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
-
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
// Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _border_size = BorderSize(pool_pad_y, pool_pad_x);
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+
+ const DataType data_type = input->info()->data_type();
// Set build options
std::set<std::string> build_opts;
@@ -114,10 +271,14 @@
{
build_opts.insert("#define DATA_TYPE_FP16");
}
+ if(exclude_padding)
+ {
+ build_opts.emplace("#define EXCLUDE_PADDING");
+ }
build_opts.emplace(("#define POOL_" + string_from_pooling_type(pool_type)));
build_opts.emplace(("#define STRIDE_X " + support::cpp11::to_string(pool_stride_x)));
- build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x)));
- build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y)));
+ build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))));
+ build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))));
build_opts.emplace(("#define STRIDE_Y " + support::cpp11::to_string(pool_stride_y)));
build_opts.emplace(("#define PAD_X " + support::cpp11::to_string(pool_pad_x)));
build_opts.emplace(("#define PAD_Y " + support::cpp11::to_string(pool_pad_y)));
@@ -127,37 +288,7 @@
{
// Check if we have a 3x3 pool with stride_x less than or equal to 3. In these cases, run an optimized OpenGLES kernel where
// each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
-
- int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
-
- if(input->info()->data_type() == DataType::F32)
- {
- if(is_pool3x3_stride_le3)
- {
- // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3
- _num_elems_processed_per_iteration = 4;
- num_elements_read_per_iteration = pool_size * (pool_stride_x + 1);
- }
- }
- else
- {
- num_elements_read_per_iteration = pool_size;
- if(is_pool3x3_stride_le3)
- {
- _num_elems_processed_per_iteration = 4;
- }
- else
- {
- _num_elems_processed_per_iteration = 2;
- }
- }
-
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- _border_size.right = std::max(upper_bound_w, pool_pad_x);
- _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+ const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
if(is_pool3x3_stride_le3)
@@ -173,53 +304,27 @@
}
else // Run general case
{
- if(input->info()->data_type() == DataType::F32)
- {
- _num_elems_processed_per_iteration = 1;
- }
- else
- {
- _num_elems_processed_per_iteration = 2;
- }
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- _border_size.right = std::max(upper_bound_w, pool_pad_x);
- _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
-
build_opts.emplace(("#define POOL_SIZE " + support::cpp11::to_string(pool_size)));
build_opts.insert("#define POOLING_LAYER_N");
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pooling_layer_n", build_opts));
}
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
+ ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
- Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+ IGCKernel::configure(std::get<1>(win_config));
+ GCPoolingConfig pooling_config = std::get<2>(win_config);
+ _num_elems_processed_per_iteration = pooling_config.first;
+ _border_size = pooling_config.second;
+}
- if(input->info()->data_type() == DataType::F32)
- {
- AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
- AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- }
- else
- {
- // Calculate output right and bottom border
- const int output_width = output->info()->dimension(0);
- const int output_height = output->info()->dimension(1);
- const int output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
- const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
- const int input_padding_right = ceil_to_multiple(input_width + 2 * _border_size.right, _num_elems_processed_per_iteration) - (input_width + 2 * _border_size.right);
- const int input_padding_bottom = ceil_to_multiple(input_height + 2 * _border_size.bottom, 1) - (input_height + 2 * _border_size.bottom);
+Status GCPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
- // Configure kernel window
- AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right + input_padding_right, input_height + _border_size.bottom + input_padding_bottom);
- AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- }
-
- IGCKernel::configure(win);
+ return Status{};
}
void GCPoolingLayerKernel::run(const Window &window)
@@ -239,7 +344,7 @@
do
{
// Upsample input by pool size
- Window in_slice(slice);
+ Window in_slice(slice); // NOLINT
in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
new file mode 100644
index 0000000..f307cfb
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize GCScaleKernel::border_size() const
+{
+ return BorderSize(1);
+}
+
+void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(output == input);
+ ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR);
+ ARM_COMPUTE_UNUSED(sampling_policy);
+
+ _input = input;
+ _output = output;
+
+ // Compute the ratio between source width/height and destination width/height
+ const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+ const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+ // Compute actual border size
+ BorderSize border = border_undefined ? BorderSize(0) : border_size();
+
+ // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+ if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ {
+ policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+ }
+
+ // Create kernel

+ std::set<std::string> build_opts;
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+
+ build_opts.emplace("#define DATA_TYPE_FP16");
+ build_opts.emplace("#define BORDER_SIZE " + support::cpp11::to_string(border.right));
+
+ // Configure kernel window
+ unsigned int num_elems_processed_per_iteration = 4;
+ unsigned int input_width_alignment = 2;
+
+ // Performance optimization for 2x upscaling with no border
+ if((fabs(wr - 0.5) < 1e-6) && (fabs(hr - 0.5) < 1e-6) && border_undefined)
+ {
+ num_elems_processed_per_iteration = 8;
+ input_width_alignment = 4;
+ build_opts.emplace("#define SCALE_NEAREST_8X");
+ }
+ else
+ {
+ build_opts.emplace("#define SCALE_NEAREST_GENERIC");
+ }
+
+ std::string interpolation_name = string_from_interpolation_policy(policy); // NOLINT
+ std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+ std::string kernel_name = "scale_" + interpolation_name;
+ _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+ const ValidRegion &input_valid_region = input->info()->valid_region();
+
+ const int total_width = border.left + input_valid_region.anchor[0] + input_valid_region.shape[0] + border.right;
+ const int padding_right = ceil_to_multiple(total_width, input_width_alignment) - border.left - input_valid_region.anchor[0] - input_valid_region.shape[0];
+
+ // Reads can occur within the valid region of the input
+ AccessWindowStatic input_access(input->info(),
+ input_valid_region.anchor[0] - border.left, input_valid_region.anchor[1] - border.top,
+ input_valid_region.anchor[0] + input_valid_region.shape[0] + padding_right,
+ input_valid_region.anchor[1] + input_valid_region.shape[1] + border.bottom);
+
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
+ output->info()->tensor_shape(),
+ policy,
+ border,
+ border_undefined));
+
+ IGCKernel::configure(win);
+
+ unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the tensor parameters
+ _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(0)));
+ _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(1)));
+ _kernel.set_argument<float>(idx++, wr);
+ _kernel.set_argument<float>(idx++, hr);
+}
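Note: a compact sketch of the scale-ratio and padding logic used by this new kernel, assuming the valid region covers the whole input (anchor 0). This is illustrative only, with a local stand-in for arm_compute's ceil_to_multiple(); it is not the library code.

#include <cmath>
#include <cstdio>

// Local stand-in mirroring arm_compute's ceil_to_multiple() for this sketch.
static int ceil_to_multiple(int value, int divisor) { return ((value + divisor - 1) / divisor) * divisor; }

int main()
{
    // Nearest-neighbour scale: work on the output grid and map back to the input
    // with the width/height ratios computed in configure().
    const int in_w = 16, in_h = 16, out_w = 32, out_h = 32;
    const float wr = static_cast<float>(in_w) / out_w; // 0.5f
    const float hr = static_cast<float>(in_h) / out_h; // 0.5f
    const bool border_undefined = true;

    // The 8-elements-per-iteration fast path is only taken for exact 2x upscaling
    // with an undefined border, matching the condition in configure().
    int step = 4, input_alignment = 2;
    if(std::fabs(wr - 0.5f) < 1e-6f && std::fabs(hr - 0.5f) < 1e-6f && border_undefined)
    {
        step            = 8;
        input_alignment = 4;
    }

    // Right padding keeps each input row read a multiple of the alignment.
    const int border_left = border_undefined ? 0 : 1, border_right = border_left;
    const int total_width   = border_left + in_w + border_right;
    const int padding_right = ceil_to_multiple(total_width, input_alignment) - border_left - in_w;

    std::printf("step=%d padding_right=%d\n", step, padding_right);
    return 0;
}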
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 621c969..bda08e4 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,7 +23,7 @@
*/
#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
@@ -57,12 +57,22 @@
_input = input;
_output = output;
+ // Choose the local workgroup size hint based on the output size, for better performance
+ if(w_out < 512 && h_out < 512)
+ {
+ _lws_hint = gles::NDRange(8U, 1U, 1U);
+ }
+ else
+ {
+ _lws_hint = gles::NDRange(1U, 8U, 1U);
+ }
+
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws_hint[0]));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws_hint[1]));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws_hint[2]));
// Configure kernel window
unsigned int num_elems_processed_per_iteration = 4;
@@ -91,13 +101,21 @@
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
+ const unsigned int width_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[0]);
+ const unsigned int height_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[1]);
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+ AccessWindowStatic input_access(input->info(), 0, 0,
+ ceil_to_multiple(input->info()->dimension(0), width_aligned),
+ ceil_to_multiple(input->info()->dimension(1), height_aligned));
+ AccessWindowStatic output_access(output->info(), 0, 0,
+ ceil_to_multiple(output->info()->dimension(0), height_aligned),
+ ceil_to_multiple(output->info()->dimension(1), width_aligned));
+
+ Window win = calculate_max_window(*input->info(), Steps(width_aligned, height_aligned));
+ win.set_dimension_step(Window::DimX, num_elems_processed_per_iteration);
+ win.set_dimension_step(Window::DimY, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
+ output_access.set_valid_region(win, output->info()->valid_region());
IGCKernel::configure(win);
}
@@ -114,28 +132,11 @@
do
{
unsigned int idx = 0;
- if(_input->info()->data_type() == DataType::F32)
- {
- add_2D_tensor_argument(idx, _input, 1, slice);
- add_2D_tensor_argument(idx, _output, 2, slice);
- }
- else if(_input->info()->data_type() == DataType::F16)
- {
-#if defined(TRANSPOSE_4X4)
- BufferParam param = { 1, 3 };
- add_2D_tensor_argument(idx, _input, param, slice);
- param.binding_point = 2;
- add_2D_tensor_argument(idx, _output, param, slice);
-#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
- BufferParam param = { 1, 4 };
- add_2D_tensor_argument(idx, _input, param, slice);
- param.binding_point = 2;
- add_2D_tensor_argument(idx, _output, param, slice);
-#endif /* TRANSPOSE_4X4 */
- }
+ add_2D_tensor_argument(idx, _input, 1, slice);
+ add_2D_tensor_argument(idx, _output, 2, slice);
_kernel.update_shader_params();
- enqueue(*this, slice);
+ enqueue(*this, slice, _lws_hint);
}
while(window.slide_window_slice_2D(slice));
}
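Note: the transpose change ties the local-workgroup-size hint to the access-window alignment. A small sketch of that relationship with made-up tensor sizes (not the library code):

#include <cstdio>

static unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor) { return ((value + divisor - 1) / divisor) * divisor; }

int main()
{
    const unsigned int w_out = 300, h_out = 600;  // example output sizes
    const unsigned int elems_per_iteration = 4;   // one 4x4 block per work item

    // Same heuristic as configure(): small outputs favour a wide workgroup in X,
    // larger ones a tall workgroup in Y.
    unsigned int lws[3] = { 1U, 8U, 1U };
    if(w_out < 512 && h_out < 512)
    {
        lws[0] = 8U;
        lws[1] = 1U;
    }

    // Padding is driven by elements-per-iteration times the workgroup size, so a
    // whole workgroup always has complete 4x4 blocks to read and write.
    const unsigned int width_aligned  = elems_per_iteration * lws[0];
    const unsigned int height_aligned = elems_per_iteration * lws[1];
    std::printf("pad to %ux%u\n", ceil_to_multiple(w_out, width_aligned), ceil_to_multiple(h_out, height_aligned));
    return 0;
}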
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
new file mode 100644
index 0000000..4c08873
--- /dev/null
+++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+using namespace arm_compute::gles_compute;
+
+GCWeightsReshapeKernel::GCWeightsReshapeKernel()
+ : _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+void GCWeightsReshapeKernel::configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+ // Calculate output shape
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0));
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
+ ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
+ }
+
+ _biases = biases;
+ _output = output;
+ _input = input;
+
+ // Create build options
+ std::set<std::string> build_opts;
+ std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
+ build_opts.emplace("#define " + dt_name);
+ build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
+ build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+ build_opts.emplace("#define RESHAPE_TO_COLUMNS");
+ if(biases != nullptr)
+ {
+ build_opts.emplace("#define HAS_BIAS");
+ }
+
+ // Create kernel
+ _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+ // Set static arguments
+ unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+ idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+ _kernel.set_argument(idx++, _input->info()->dimension(0));
+ _kernel.set_argument(idx++, _input->info()->dimension(1));
+ _kernel.set_argument(idx++, _input->info()->dimension(2));
+ _kernel.set_argument(idx++, _input->info()->dimension(3));
+
+ // Configure window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // The GCWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+ IGCKernel::configure(win);
+}
+
+void GCWeightsReshapeKernel::run(const Window &window)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
+
+ Window out_window;
+ out_window.use_tensor_dimensions(_output->info()->tensor_shape());
+
+ Window in_slice = window.first_slice_window_3D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ Window biases_window;
+ Window biases_slice;
+
+ if(_biases != nullptr)
+ {
+ biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
+ biases_slice = biases_window.first_slice_window_1D();
+ }
+
+ _kernel.use();
+
+ do
+ {
+ // Set arguments
+ unsigned idx = 0;
+ add_3D_tensor_argument(idx, _input, 1, in_slice);
+ add_2D_tensor_argument(idx, _output, 2, out_slice);
+ if(_biases != nullptr)
+ {
+ add_1D_tensor_argument(idx, _biases, 3, biases_slice);
+ biases_window.slide_window_slice_1D(biases_slice);
+ }
+
+ _kernel.update_shader_params();
+ // Run kernel
+ enqueue(*this, in_slice);
+ }
+ while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
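Note: a worked example of the output-shape computation in configure(), assuming TensorShape::collapse(3) folds the first three dimensions as in arm_compute's Dimensions helpers, and that the weights are laid out [kernel_x, kernel_y, IFM, OFM]. The numbers are illustrative; this is not the library code.

#include <cstdio>
#include <vector>

int main()
{
    // Example weights: 3x3 kernels, 16 input channels, 64 filters.
    std::vector<size_t> shape = { 3, 3, 16, 64 };
    const bool has_bias = true;

    // collapse(3): fold the first three dimensions into one -> [144, 64].
    const size_t collapsed = shape[0] * shape[1] * shape[2];
    const size_t dim0 = collapsed; // 144
    const size_t dim1 = shape[3];  // 64

    // Swap the two dimensions and append one extra row when a bias is present,
    // matching the shape computation in configure(): [64, 145] here.
    const size_t out_x = dim1;
    const size_t out_y = dim0 + (has_bias ? 1 : 0);
    std::printf("reshaped weights: %zux%zu\n", out_x, out_y);
    return 0;
}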
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index 1b6175e..73f4c42 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -66,10 +66,20 @@
Size2D HOGInfo::num_cells_per_block() const
{
+ ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
+
return Size2D(_block_size.width / _cell_size.width,
_block_size.height / _cell_size.height);
}
+Size2D HOGInfo::num_cells_per_block_stride() const
+{
+ ARM_COMPUTE_ERROR_ON(_cell_size.width == 0 || _cell_size.height == 0);
+
+ return Size2D(_block_stride.width / _cell_size.width,
+ _block_stride.height / _cell_size.height);
+}
+
Size2D HOGInfo::num_blocks_per_image(const Size2D &image_size) const
{
return Size2D(((image_size.width - _block_size.width) / _block_stride.width) + 1,
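Note: the new num_cells_per_block_stride() accessor mirrors num_cells_per_block() but divides the block stride by the cell size. A quick numeric check with typical HOG parameters (values are illustrative, not taken from the library):

#include <cstdio>

int main()
{
    // Typical HOG setup: 16x16 block, 8x8 cell, 8x8 block stride, 64x128 detection window.
    const unsigned int block_w = 16, block_h = 16;
    const unsigned int cell_w = 8, cell_h = 8;
    const unsigned int stride_w = 8, stride_h = 8;
    const unsigned int img_w = 64, img_h = 128;

    // num_cells_per_block(): 2x2 cells per block.
    std::printf("cells per block:        %ux%u\n", block_w / cell_w, block_h / cell_h);
    // num_cells_per_block_stride(): 1x1 cell per block-stride step.
    std::printf("cells per block stride: %ux%u\n", stride_w / cell_w, stride_h / cell_h);
    // num_blocks_per_image(): ((64-16)/8)+1 x ((128-16)/8)+1 = 7x15 blocks.
    std::printf("blocks per image:       %ux%u\n",
                ((img_w - block_w) / stride_w) + 1, ((img_h - block_h) / stride_h) + 1);
    return 0;
}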
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 151d7de..3ee0fa7 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,25 +23,17 @@
*/
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/IKernel.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Utils.h"
-
-#include <algorithm>
-#include <cstdint>
-
using namespace arm_compute;
-Window arm_compute::calculate_max_window(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(!skip_border)
{
border_size = BorderSize(0);
}
- const Coordinates &anchor = info.valid_region().anchor;
- const TensorShape &shape = info.valid_region().shape;
+ const Coordinates &anchor = valid_region.anchor;
+ const TensorShape &shape = valid_region.shape;
Window window;
@@ -53,10 +45,9 @@
anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
- size_t n = 1;
- const TensorShape &tensor_shape = info.tensor_shape();
+ size_t n = 1;
- if(tensor_shape.num_dimensions() > 1)
+ if(anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
// Skip the border above the image
@@ -68,18 +59,23 @@
++n;
}
+ for(; n < anchor.num_dimensions(); ++n)
+ {
+ window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+ }
+
for(; n < Coordinates::num_max_dimensions; ++n)
{
- window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ window.set(n, Window::Dimension(0, 1));
}
return window;
}
-Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps, BorderSize border_size)
+Window arm_compute::calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
{
- const Coordinates &anchor = info.valid_region().anchor;
- const TensorShape &shape = info.valid_region().shape;
+ const Coordinates &anchor = valid_region.anchor;
+ const TensorShape &shape = valid_region.shape;
Window window;
@@ -91,10 +87,9 @@
anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
steps[0]));
- size_t n = 1;
- const TensorShape &tensor_shape = info.tensor_shape();
+ size_t n = 1;
- if(tensor_shape.num_dimensions() > 1)
+ if(anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
// Include the border above the image
@@ -106,22 +101,27 @@
++n;
}
- if(tensor_shape.num_dimensions() > 2)
+ if(anchor.num_dimensions() > 2)
{
- window.set(2, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n]), steps[2]));
+ window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
++n;
}
+ for(; n < anchor.num_dimensions(); ++n)
+ {
+ window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+ }
+
for(; n < Coordinates::num_max_dimensions; ++n)
{
- window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ window.set(n, Window::Dimension(0, 1));
}
return window;
}
-Window arm_compute::calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps, bool skip_border, BorderSize border_size)
+Window arm_compute::calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
{
if(skip_border)
{
@@ -134,8 +134,8 @@
border_size.right = 0;
}
- const Coordinates &anchor = info.valid_region().anchor;
- const TensorShape &shape = info.valid_region().shape;
+ const Coordinates &anchor = valid_region.anchor;
+ const TensorShape &shape = valid_region.shape;
Window window;
@@ -147,10 +147,9 @@
anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
steps[0]));
- size_t n = 1;
- const TensorShape &tensor_shape = info.tensor_shape();
+ size_t n = 1;
- if(tensor_shape.num_dimensions() > 1)
+ if(anchor.num_dimensions() > 1)
{
window.set(1, Window::Dimension(
// Skip the border above the image
@@ -162,9 +161,14 @@
++n;
}
+ for(; n < anchor.num_dimensions(); ++n)
+ {
+ window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
+ }
+
for(; n < Coordinates::num_max_dimensions; ++n)
{
- window.set(n, Window::Dimension(0, std::max<size_t>(1, tensor_shape[n])));
+ window.set(n, Window::Dimension(0, 1));
}
return window;
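Note: with these changes the window calculation is driven purely by the ValidRegion rather than the tensor shape. A standalone sketch of the X-dimension arithmetic shown above, with illustrative values and a local ceil_to_multiple() stand-in; not the library code.

#include <algorithm>
#include <cstdio>

static int ceil_to_multiple(int value, int divisor) { return ((value + divisor - 1) / divisor) * divisor; }

int main()
{
    // X dimension of calculate_max_window(): anchor/shape come from the valid region.
    const int anchor = 0, shape = 17;            // valid region starts at 0, 17 elements wide
    const int border_left = 1, border_right = 1; // skip_border case
    const int step = 8;

    const int start = anchor + border_left;
    const int end   = anchor + border_left
                      + ceil_to_multiple(std::max(0, shape - border_left - border_right), step);
    std::printf("window X: [%d, %d) step %d\n", start, end, step); // [1, 17) step 8
    return 0;
}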
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 4d76dfe..a2b24de 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -122,6 +122,14 @@
break;
}
+ TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]);
+ TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() };
+ subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]);
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), subsampled_shape_plane1);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2);
+
Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
@@ -248,14 +256,14 @@
const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
- AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[0], 1.f / _y_subsampling[0]);
- AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, y_step, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+ AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
+ AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
update_window_and_padding(win,
- AccessWindowRectangle(plane0->info(), 0, 0, _num_elems_processed_per_iteration, y_step),
- AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
- AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, y_step, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
+ AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
+ AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
+ AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
output_plane0_access,
output_plane1_access,
output_plane2_access);
@@ -350,7 +358,7 @@
{
// Create sub-sampled uv window and init uv planes
Window win_uv(win);
- win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
+ win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]);
win_uv.validate();
Iterator p0(_planes[0], win);
@@ -397,13 +405,13 @@
// Update UV window
Window uv_win(win);
- uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
+ uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration));
uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
uv_win.validate();
// Update output win
Window out_win(win);
- out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
+ out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2));
out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
out_win.validate();
@@ -413,9 +421,6 @@
Iterator p2(_planes[2 - shift], uv_win);
Iterator out(_output_multi->plane(1), out_win);
- // Increase step size after iterator is created to calculate stride correctly for multi channel format
- out_win.set_dimension_step(Window::DimX, out_win.x().step() * 2);
-
execute_window_loop(out_win, [&](const Coordinates & id)
{
const uint8x8x2_t pixels =
@@ -445,17 +450,19 @@
// Update window
Window tmp_win(win);
- tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
+ tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration));
tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
+ tmp_win.validate();
Iterator in(_planes[plane_id], tmp_win);
Iterator out(_output_multi->plane(plane_id), tmp_win);
execute_window_loop(tmp_win, [&](const Coordinates & id)
{
- const uint8x8_t pixels = vld1_u8(in.ptr());
+ const auto in_ptr = static_cast<uint8_t *>(in.ptr());
+ const auto out_ptr = static_cast<uint8_t *>(out.ptr());
- vst1_u8(out.ptr(), pixels);
+ vst1_u8(out_ptr, vld1_u8(in_ptr));
},
in, out);
}
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index 263fbe0..7468f58 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -627,16 +627,17 @@
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_output->info()->format())
+ switch(_output->info()->data_type())
{
- case Format::U8:
+ case DataType::U8:
convolution<uint8_t>(window);
break;
- case Format::S16:
+ case DataType::S16:
convolution<int16_t>(window);
break;
default:
- ARM_COMPUTE_ERROR("Not supported");
+ ARM_COMPUTE_ERROR("Data type not supported!");
+ break;
}
}
@@ -1521,13 +1522,13 @@
};
// Run appropriate function
- switch(_output->info()->format())
+ switch(_output->info()->data_type())
{
- case Format::U8:
+ case DataType::U8:
ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
(this->*func_table_u8[_func_idx])(window);
break;
- case Format::S16:
+ case DataType::S16:
ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
(this->*func_table_s16[_func_idx])(window);
break;
diff --git a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp b/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp
deleted file mode 100644
index 71db2e9..0000000
--- a/src/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-NEDeconvolutionLayerUpsampleKernel::NEDeconvolutionLayerUpsampleKernel()
- : _offsets(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-BorderSize NEDeconvolutionLayerUpsampleKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void NEDeconvolutionLayerUpsampleKernel::configure(const ITensor *input, const ITensor *offsets, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0);
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- _input = input;
- _output = output;
- _offsets = offsets;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
- const int border_offset = border_size().left;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
- AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, offsets_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
-}
-
-void NEDeconvolutionLayerUpsampleKernel::scale_nearest(const Window &window)
-{
- const size_t input_stride = _input->info()->strides_in_bytes()[1];
-
- // Compute the ratio between source height and destination height
- const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_off;
- win_off.set(Window::DimX, window[Window::DimX]);
- win_off.set(Window::DimY, window[Window::DimY]);
-
- for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(_input, win_in);
- Iterator out(_output, window);
- Iterator offsets(_offsets, win_off);
-
- switch(_input->info()->data_type())
- {
- case DataType::F32:
- {
- float32x4x4_t tmp =
- {
- {
- vdupq_n_f32(0),
- vdupq_n_f32(0)
- }
- };
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
- const size_t in_yi = (id.y() + 0.5f) * hr;
- const size_t offset_row = in_yi * input_stride;
-
- tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
- tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
- tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
- tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
-
- tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
- tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
- tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
- tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
-
- tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
- tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
- tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
- tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
-
- tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
- tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
- tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
- tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
-
- vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
- },
- in, offsets, out);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
-}
-
-void NEDeconvolutionLayerUpsampleKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- scale_nearest(window);
-}
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 01b0f10..891a03c 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -154,7 +154,7 @@
AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
INEKernel::configure(win);
}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index dd5c448..bc2f1ed 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,70 +36,24 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
using namespace arm_compute::detail;
+using namespace arm_compute::misc::shape_calculator;
-NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _conv_info()
+namespace
{
-}
-
-BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
-{
- return _border_size;
-}
-
-void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
- std::pair<unsigned int, unsigned int> expected_output = scaled_dimensions(input->info()->tensor_shape().x(), input->info()->tensor_shape().y(),
- weights->info()->tensor_shape().x(), weights->info()->tensor_shape().y(),
- conv_info);
-
- ARM_COMPUTE_ERROR_ON(expected_output.first != output->info()->tensor_shape().x());
- ARM_COMPUTE_ERROR_ON(expected_output.second != output->info()->tensor_shape().y());
-
- _input = input;
- _output = output;
- _weights = weights;
- _conv_info = conv_info;
- const unsigned int conv_stride_x = conv_info.stride().first;
- const unsigned int conv_stride_y = conv_info.stride().second;
- const unsigned int conv_pad_x = conv_info.pad().first;
- const unsigned int conv_pad_y = conv_info.pad().second;
-
- ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
-
- const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
- _border_size = BorderSize(conv_pad_y, conv_pad_x);
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_written_per_iteration));
-
- const unsigned int num_x_steps = (expected_output.first + num_elems_written_per_iteration - 1) / num_elems_written_per_iteration;
- const int input_num_elems_processed = get_input_num_elems_processed(num_elems_written_per_iteration, conv_stride_x);
-
- AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, (num_x_steps - 1) * input_num_elems_processed + 12, conv_stride_y * (expected_output.second - 1) + 2);
- AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
- AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * num_elems_written_per_iteration, expected_output.second);
-
- update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
-}
-
-template <unsigned int stridex>
+template <typename T1, typename T2, unsigned int stridex>
class convolver_3x3
{
public:
static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
+ const int input_offset = -input->info()->quantization_info().offset;
+ const int weights_offset = -weights->info()->quantization_info().offset;
+
const int input_stride_x = input->info()->strides_in_bytes().x();
const int input_stride_y = input->info()->strides_in_bytes().y();
const int output_stride_y = output->info()->strides_in_bytes().y();
@@ -109,8 +63,8 @@
const int output_h = output->info()->dimension(1);
const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
- const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
+ const unsigned int conv_pad_x = conv_info.pad_left();
+ const unsigned int conv_pad_y = conv_info.pad_top();
// setup output window for the iterator
Window window_out = window;
@@ -133,29 +87,31 @@
execute_window_loop(window_out, [&](const Coordinates & id)
{
- const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
- int ih = 0;
- int oh = 0;
+ int ih = 0;
+ int oh = 0;
- const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
- const auto ptr_weights_r0 = reinterpret_cast<const float *>(ptr_weights_base);
- const auto ptr_weights_r1 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y);
- const auto ptr_weights_r2 = reinterpret_cast<const float *>(ptr_weights_base + kernel_stride_y * 2);
- const float32x4x3_t vw_r0 = load_matrix_row(ptr_weights_r0);
- const float32x4x3_t vw_r1 = load_matrix_row(ptr_weights_r1);
- const float32x4x3_t vw_r2 = load_matrix_row(ptr_weights_r2);
+ const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
+ const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
+
+ const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
+ const auto ptr_weights_r1 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y);
+ const auto ptr_weights_r2 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y * 2);
+ const auto vw_r0 = load_matrix_row(ptr_weights_r0, weights_offset);
+ const auto vw_r1 = load_matrix_row(ptr_weights_r1, weights_offset);
+ const auto vw_r2 = load_matrix_row(ptr_weights_r2, weights_offset);
for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
{
- auto in_top = reinterpret_cast<const float *>(input_ptr + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const float *>(input_ptr + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const float *>(input_ptr + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<float *>(out.ptr() + oh * output_stride_y);
+ auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
+ auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + 1) * input_stride_y);
+ auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2) * input_stride_y);
+ auto p_out = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y);
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
+ in_top += delta_input, in_mid += delta_input, in_low += delta_input,
+ p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0);
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0, input_offset);
store_results<stridex>(p_out, vres);
}
}
@@ -164,24 +120,113 @@
}
};
+template <typename T1, typename T2>
+inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
+ const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+ switch(conv_stride_x)
+ {
+ case 1:
+ convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 2:
+ convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ case 3:
+ convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+}
+} // namespace
+
+NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
+ : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0)
+{
+}
+
+BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
+{
+ return _border_size;
+}
+
+void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
+
+ // Get convolved dimensions
+ const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info);
+ const DataType output_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(),
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ _weights = weights;
+ _conv_info = conv_info;
+ const unsigned int conv_stride_x = conv_info.stride().first;
+ const unsigned int conv_stride_y = conv_info.stride().second;
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
+
+ ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
+
+ unsigned int num_elems_read_per_iteration = 0;
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ num_elems_read_per_iteration = 16;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
+ case DataType::F32:
+ num_elems_read_per_iteration = 12;
+ _num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ }
+ _border_size = BorderSize(conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), conv_pad_left);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+
+ const unsigned int num_x_steps = (output_shape.x() + _num_elems_written_per_iteration - 1) / _num_elems_written_per_iteration;
+ const int input_num_elems_processed = get_input_num_elems_processed(_num_elems_written_per_iteration, conv_stride_x);
+
+ AccessWindowStatic input_access(input->info(),
+ -conv_pad_left,
+ -conv_pad_top,
+ (num_x_steps - 1) * input_num_elems_processed + num_elems_read_per_iteration,
+ conv_stride_y * (output_shape.y() - 1) + 2);
+ AccessWindowStatic weights_access(weights->info(), 0, 0, weights->info()->dimension(0), weights->info()->dimension(1));
+ AccessWindowStatic output_access(output->info(), 0, 0, num_x_steps * _num_elems_written_per_iteration, output_shape.y());
+
+ update_window_and_padding(win, input_access, weights_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_UNUSED(info);
- const unsigned int conv_stride_x = _conv_info.stride().first;
- const unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
-
- switch(conv_stride_x)
+ switch(_input->info()->data_type())
{
- case 1:
- convolver_3x3<1>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::F32:
+ convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
- case 2:
- convolver_3x3<2>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- case 3:
- convolver_3x3<3>::convolve(window, num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QASYMM8:
+ convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
default:
ARM_COMPUTE_ERROR("Not implemented");
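Note: the new QASYMM8 path accumulates into S32 (per the output data type set in configure()) and folds the negated zero points (input_offset/weights_offset above) into the integer multiply-accumulate, since for asymmetric quantization real = scale * (q - zero_point). Below is a scalar reference sketch of that accumulation for one output element; it is illustrative only, not the NEON kernel.

#include <cstdint>
#include <cstdio>

// Scalar reference for one output element of a 3x3 depthwise convolution on
// QASYMM8 data: accumulate (q_in - in_zero_point) * (q_w - w_zero_point) into S32.
static int32_t depthwise_3x3_point(const uint8_t in[3][3], const uint8_t w[3][3],
                                   int32_t input_offset, int32_t weights_offset)
{
    int32_t acc = 0;
    for(int y = 0; y < 3; ++y)
    {
        for(int x = 0; x < 3; ++x)
        {
            // input_offset / weights_offset are the *negated* zero points,
            // matching how the kernel passes them into the convolver.
            acc += (static_cast<int32_t>(in[y][x]) + input_offset)
                   * (static_cast<int32_t>(w[y][x]) + weights_offset);
        }
    }
    return acc;
}

int main()
{
    const uint8_t in[3][3] = { { 10, 12, 14 }, { 10, 12, 14 }, { 10, 12, 14 } };
    const uint8_t w[3][3]  = { { 9, 8, 9 }, { 8, 8, 8 }, { 9, 8, 9 } };
    // Zero points of 8 for both tensors: pass them negated, as the kernel does.
    std::printf("acc = %d\n", static_cast<int>(depthwise_3x3_point(in, w, -8, -8)));
    return 0;
}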
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index a5680eb..06e6b03 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -80,9 +80,11 @@
AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration);
+
+ AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
AccessWindowRectangle in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration);
- AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_rows_read_per_iteration);
+
+ AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_rows_read_per_iteration);
if(run_der_x && run_der_y)
{
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
deleted file mode 100644
index 65b7087..0000000
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
- if(is_data_type_quantized(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
-
- // Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(bias, output);
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
-{
- bool window_changed = false;
- const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- if(output != nullptr && (output->total_size() != 0))
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, bias_access);
- input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
-// Internal load
-inline float32x4_t internal_vld1q(const float *in)
-{
- return vld1q_f32(in);
-}
-inline qint8x16_t internal_vld1q(const qint8_t *in)
-{
- return vld1q_qs8(in);
-}
-inline qint16x8_t internal_vld1q(const qint16_t *in)
-{
- return vld1q_qs16(in);
-}
-
-inline qint32x4_t internal_vld1q(const qint32_t *in)
-{
- return vld1q_s32(in);
-}
-
-// Internal store
-inline void internal_vst1q(float *p, const float32x4_t &v)
-{
- vst1q_f32(p, v);
-}
-inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
-{
- vst1q_qs8(p, v);
-}
-inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
-{
- vst1_qs8(p, vqmovn_s16(v));
-}
-inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
-{
- vst1q_qs16(p, v);
-}
-
-inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
-{
- vst1q_s32(p, v);
-}
-
-inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
-{
- vst1_qs16(p, vqmovn_qs32(v));
-}
-
-// Internal vdup
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-inline qint8x16_t internal_vdupq_n(qint8_t v)
-{
- return vdupq_n_qs8(v);
-}
-inline qint16x8_t internal_vdupq_n(qint16_t v)
-{
- return vdupq_n_qs16(v);
-}
-
-inline qint32x4_t internal_vdupq_n(qint32_t v)
-{
- return vdupq_n_qs32(v);
-}
-
-// Internal vadd
-inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
-{
- return vaddq_f32(x, y);
-}
-inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
-{
- return vqaddq_qs8(x, y);
-}
-inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
-{
- return vqaddq_qs16(x, y);
-}
-inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
-{
- return vqaddq_qs32(x, y);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16x8_t internal_vld1q(const float16_t *in)
-{
- return vld1q_f16(in);
-}
-inline void internal_vst1q(float16_t *p, const float16x8_t &v)
-{
- vst1q_f16(p, v);
-}
-inline float16x8_t internal_vdupq_n(float16_t v)
-{
- return vdupq_n_f16(v);
-}
-inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
-{
- return vaddq_f16(x, y);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T1, typename T2, bool in_place>
-void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
-{
- Iterator in(input, window);
-
- if(in_place) // In place accumulate
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
- const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
-
- // Accumulate bias
- internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
- },
- in);
- }
- else // Out of place accumulate
- {
- Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
- const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
-
- // Accumulate bias
- internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
- },
- in, out);
- }
-}
-} // namespace
-
-NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, bias);
-
- // Auto-initialize output output if required
- if(output != nullptr)
- {
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info());
- }
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info()));
-
- _func = nullptr;
- _bias = bias;
- _input = input;
- _output = output;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), bias->info(), (output == nullptr) ? nullptr : output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-
- // Set appropriate function
- switch(input->info()->data_type())
- {
- case DataType::QS8:
- {
- _func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
- break;
- }
- case DataType::QS16:
- {
- if(bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &accumulate_bias<qint16_t, qint8_t, true> : &accumulate_bias<qint16_t, qint8_t, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
- break;
- }
- case DataType::QS32:
- {
- _func = (output == nullptr) ? &accumulate_bias<qint32_t, qint16_t, true> : &accumulate_bias<qint32_t, qint16_t, false>;
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- break;
- }
- }
-}
-
-Status NEDirectConvolutionLayerBiasAccumulateKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);
-
- return Status{};
-}
-
-void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(_input, _bias, window, _output);
-}
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 2ba0ef2..cb8246d 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1052,8 +1052,6 @@
{
// Calculate right and bottom border
unsigned int kernel_size = weights->dimension(0);
- const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
- const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
const int input_width = input->dimension(0);
@@ -1122,12 +1120,22 @@
}
}
- const int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_pad_x - input_width;
- const int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_pad_y + kernel_size) - input_height;
- border_size.right = std::max(upper_bound_w, static_cast<int>(kernel_size));
- border_size.bottom = std::max(upper_bound_h, static_cast<int>(kernel_size));
+ // Calculate border
+ int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_info.pad_left() - conv_info.pad_right() - input_width;
+ int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_info.pad_top() - conv_info.pad_bottom() + kernel_size) - input_height;
+
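+ // Each side of the border must be at least half the kernel size and wide enough to
+ // cover the elements that the vectorized loads read past the input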
+ const unsigned int conv_pad_left = std::max(upper_bound_w - static_cast<int>(conv_info.pad_right()), static_cast<int>(kernel_size) / 2);
+ const unsigned int conv_pad_top = std::max(upper_bound_h - static_cast<int>(conv_info.pad_bottom()), static_cast<int>(kernel_size) / 2);
+ const unsigned int conv_pad_right = std::max(upper_bound_w - static_cast<int>(conv_info.pad_left()), static_cast<int>(kernel_size) / 2);
+ const unsigned int conv_pad_bottom = std::max(upper_bound_h - static_cast<int>(conv_info.pad_top()), static_cast<int>(kernel_size) / 2);
+
+ border_size.right = conv_pad_right;
+ border_size.bottom = conv_pad_bottom;
+ border_size.left = conv_pad_left;
+ border_size.top = conv_pad_top;
+
Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
- AccessWindowStatic input_access(input, -conv_pad_x, -conv_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+ AccessWindowStatic input_access(input, -conv_pad_left, -conv_pad_top, input_width + conv_pad_right, input_height + conv_pad_bottom);
AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
@@ -1152,15 +1160,18 @@
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
- const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
_input = input;
_weights = weights;
_output = output;
_conv_info = conv_info;
_kernel_size = weights->info()->dimension(0);
- _border_size = BorderSize(conv_pad_y, conv_pad_x);
+
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
+ const unsigned int conv_pad_right = conv_info.pad_right();
+ const unsigned int conv_pad_bottom = conv_info.pad_bottom();
+ _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
// Get convolved dimensions
TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info);
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
new file mode 100644
index 0000000..52880a3
--- /dev/null
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
+ DataType::QS16, DataType::F16,
+ DataType::QS32, DataType::S32, DataType::F32);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
+
+ if(is_data_type_fixed_point(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_data_type_quantized(input->data_type()), "Calling output stage kernel with floating point arguments");
+ }
+
+ // Checks performed when output is configured
+ if((output != nullptr) && (output->total_size() != 0))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+ if(is_data_type_fixed_point(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ }
+ else if(is_data_type_quantized_asymmetric(output->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for output");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ }
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+ bool window_changed = false;
+ unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+
+ // Update processed elements when input is S32 (comes from quantization input)
+ if(input->data_type() == DataType::S32)
+ {
+ num_elems_processed_per_iteration = 16;
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+
+ if(output != nullptr && (output->total_size() != 0))
+ {
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ if(bias == nullptr)
+ {
+ window_changed = update_window_and_padding(win, input_access, output_access);
+ }
+ else
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
+ }
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+ }
+ else
+ {
+ if(bias == nullptr)
+ {
+ window_changed = update_window_and_padding(win, input_access);
+ }
+ else
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
+ window_changed = update_window_and_padding(win, input_access, bias_access);
+ }
+
+ input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
+
+// Internal load
+inline float32x4_t internal_vld1q(const float *in)
+{
+ return vld1q_f32(in);
+}
+inline qint8x16_t internal_vld1q(const qint8_t *in)
+{
+ return vld1q_qs8(in);
+}
+inline qint16x8_t internal_vld1q(const qint16_t *in)
+{
+ return vld1q_qs16(in);
+}
+inline qint32x4_t internal_vld1q(const qint32_t *in)
+{
+ return vld1q_s32(in);
+}
+
+// Internal store
+inline void internal_vst1q(float *p, const float32x4_t &v)
+{
+ vst1q_f32(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
+{
+ vst1q_qs8(p, v);
+}
+inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
+{
+ vst1_qs8(p, vqmovn_s16(v));
+}
+inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
+{
+ vst1q_qs16(p, v);
+}
+inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
+{
+ vst1q_s32(p, v);
+}
+
+inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
+{
+ vst1_qs16(p, vqmovn_qs32(v));
+}
+
+// Internal vdup
+inline float32x4_t internal_vdupq_n(float v)
+{
+ return vdupq_n_f32(v);
+}
+inline qint8x16_t internal_vdupq_n(qint8_t v)
+{
+ return vdupq_n_qs8(v);
+}
+inline qint16x8_t internal_vdupq_n(qint16_t v)
+{
+ return vdupq_n_qs16(v);
+}
+inline qint32x4_t internal_vdupq_n(qint32_t v)
+{
+ return vdupq_n_qs32(v);
+}
+
+// Internal vadd
+inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
+{
+ return vaddq_f32(x, y);
+}
+inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
+{
+ return vqaddq_qs8(x, y);
+}
+inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
+{
+ return vqaddq_qs16(x, y);
+}
+inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
+{
+ return vqaddq_qs32(x, y);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16x8_t internal_vld1q(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
+{
+ return vaddq_f16(x, y);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T1, typename T2, bool in_place, bool has_bias>
+void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
+ Iterator in(input, window);
+
+ if(in_place) // In place accumulate
+ {
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<T1 *>(in.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+ internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ }
+ else
+ {
+ internal_vst1q(in_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in);
+ }
+ else // Out of place accumulate
+ {
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T1 *>(in.ptr());
+ const auto out_ptr = reinterpret_cast<T2 *>(out.ptr());
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto vb = internal_vdupq_n(static_cast<T1>(*reinterpret_cast<const T2 *>(bias->ptr_to_element(Coordinates(id.z())))));
+ internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb));
+ }
+ else
+ {
+ internal_vst1q(out_ptr, internal_vld1q(in_ptr));
+ }
+ },
+ in, out);
+ }
+}
+
+// QASYMM8 specializations
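+// These take S32 accumulators, add the bias where present and requantize to QASYMM8
+// through finalize_quantization() (fixed-point multiply, rounding shift and offset,
+// saturated to the 8-bit range).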
+template <>
+void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ // Accumulate bias
+ const auto vb = vdupq_n_s32(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))));
+ v_in =
+ {
+ {
+ vaddq_s32(v_in.val[0], vb),
+ vaddq_s32(v_in.val[1], vb),
+ vaddq_s32(v_in.val[2], vb),
+ vaddq_s32(v_in.val[3], vb)
+ }
+ };
+
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, out);
+}
+template <>
+void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(bias);
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, out);
+}
+} // namespace
+
+NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
+{
+}
+
+void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+ // Auto-initialize output if required
+ if(output != nullptr)
+ {
+ // Work out expected output data type
+ const DataType output_dt = (input->info()->data_type() == DataType::S32) ? DataType::QASYMM8 : input->info()->data_type();
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
+ }
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info()));
+
+ _func = nullptr;
+ _bias = bias;
+ _input = input;
+ _output = output;
+ _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+ _result_shift = result_shift;
+ _result_offset_after_shift = result_offset_after_shift;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+
+ // Set appropriate function
+ switch(input->info()->data_type())
+ {
+ case DataType::QS8:
+ {
+ if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
+ }
+ else
+ {
+ _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
+ }
+ break;
+ }
+ case DataType::QS16:
+ {
+ if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
+ }
+ else if(bias == nullptr)
+ {
+ _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not implemented");
+ }
+ break;
+ }
+ case DataType::QS32:
+ {
+ _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
+ break;
+ }
+ case DataType::S32:
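+ // S32 accumulators come from the quantized path and are always requantized
+ // out of place to QASYMM8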
+ _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ case DataType::F32:
+ {
+ _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
+ }
+ }
+}
+
+Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), bias->clone().get(), output == nullptr ? nullptr : output->clone().get()).first);
+
+ return Status{};
+}
+
+void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+ (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
+}
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index af04955..747b8b1 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -105,7 +105,10 @@
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QASYMM8,
+ DataType::QS16, DataType::U16, DataType::S16,
+ DataType::U32, DataType::S32,
+ DataType::F16, DataType::F32);
_tensor = tensor;
_border_size = border_size;
@@ -140,6 +143,7 @@
{
switch(_tensor->info()->data_type())
{
+ case DataType::QASYMM8:
case DataType::U8:
fill_constant_value_single_channel<uint8_t>(window);
break;
@@ -184,6 +188,7 @@
{
switch(_tensor->info()->data_type())
{
+ case DataType::QASYMM8:
case DataType::U8:
fill_replicate_single_channel<uint8_t>(window);
break;
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 9104f0b..a100cd2 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -786,7 +786,7 @@
window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index f696400..3d41548 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -71,16 +71,16 @@
vector_sum_row_shape.collapse_from(1);
output_shape.collapse_from(2);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2],
+ "mm_result tensor must have the same number of batches of output tensor");
if(a_offset != 0)
{
TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
vector_sum_col_shape.collapse_from(1);
- ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1
- && vector_sum_col_shape[1] != vector_sum_row_shape[1],
- "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+ "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
}
}
}
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 8b3f238..5e14e1a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -96,57 +96,11 @@
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
+} // namespace
-template <bool is_bounded_relu>
-inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8,
- uint8x16_t max_u8)
+namespace arm_compute
{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
- in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
- // Round to the nearest division by a power-of-two using result_shift_s32
- in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
- in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
- in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
- in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
-
- // Add the offset terms
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to U8
- uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_u8 = vmaxq_u8(out_u8, min_u8);
- out_u8 = vminq_u8(out_u8, max_u8);
- }
-
- return out_u8;
-}
+class Coordinates;
/* Function used by the left-over for loop to perform the quantization */
template <bool is_bounded_relu>
@@ -178,11 +132,6 @@
return out_u8;
}
-} // namespace
-
-namespace arm_compute
-{
-class Coordinates;
} // namespace arm_compute
template <bool is_bounded_relu>
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 0aadfc9..c1ee770 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -64,7 +64,7 @@
bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
@@ -90,7 +90,7 @@
bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index a583c1d..aa5e2dd 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -1408,6 +1408,129 @@
},
ina, inb, out);
}
+
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_UNUSED(input0);
+ ARM_COMPUTE_UNUSED(input1);
+ ARM_COMPUTE_UNUSED(output);
+
+ if(output->dimension(1) == 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+ Window win = Window();
+ bool window_changed = false;
+
+ unsigned int num_elems_processed_per_iteration_x = 0;
+ const unsigned int num_elems_processed_per_iteration_y = 4;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if((output->dimension(1) == 1))
+ {
+ switch(input0->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration_x = 16;
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
+
+ window_changed = update_window_and_padding(win,
+ AccessWindowStatic(input0, 0, 0, input0->tensor_shape().x(), 1),
+ AccessWindowHorizontal(input1, 0, num_elems_processed_per_iteration_x),
+ output_access);
+
+ Coordinates coord;
+ coord.set_num_dimensions(output->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
+ }
+ else
+ {
+ switch(input0->data_type())
+ {
+ case DataType::F32:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+ case DataType::QS8:
+ {
+ num_elems_processed_per_iteration_x = 32;
+ break;
+ }
+ case DataType::QS16:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ num_elems_processed_per_iteration_x = 8;
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ {
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+ }
+
+ // Configure kernel window
+ win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ window_changed = update_window_and_padding(win,
+ AccessWindowRectangle(input0, 0, 0, 4, 1, 1.f, 0.25f),
+ AccessWindowStatic(input1, 0, 0, input1->tensor_shape().x(), ceil_to_multiple(input1->tensor_shape().y(), 4)),
+ output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+ }
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, win);
+}
} // namespace
NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel()
@@ -1417,120 +1540,27 @@
void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
-
- if(output->info()->dimension(1) == 1)
- {
- ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
_input1 = input1;
_output = output;
_alpha = alpha;
- unsigned int num_elems_processed_per_iteration_x = 0;
- const unsigned int num_elems_processed_per_iteration_y = 4;
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication
- if((output->info()->dimension(1) == 1))
- {
- switch(input0->info()->data_type())
- {
- case DataType::F32:
- {
- num_elems_processed_per_iteration_x = 16;
- break;
- }
- case DataType::QS8:
- {
- num_elems_processed_per_iteration_x = 32;
- break;
- }
- case DataType::QS16:
- {
- num_elems_processed_per_iteration_x = 16;
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- num_elems_processed_per_iteration_x = 32;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
+Status NEGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x);
-
- update_window_and_padding(win,
- AccessWindowStatic(input0->info(), 0, 0, input0->info()->tensor_shape().x(), 1),
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
- output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
-
- INEKernel::configure(win);
- }
- else
- {
- switch(input0->info()->data_type())
- {
- case DataType::F32:
- {
- num_elems_processed_per_iteration_x = 8;
- break;
- }
- case DataType::QS8:
- {
- num_elems_processed_per_iteration_x = 32;
- break;
- }
- case DataType::QS16:
- {
- num_elems_processed_per_iteration_x = 8;
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- num_elems_processed_per_iteration_x = 8;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win,
- AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f),
- AccessWindowStatic(input1->info(), 0, 0, input1->info()->tensor_shape().x(), ceil_to_multiple(input1->info()->tensor_shape().y(), 4)),
- output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
- }
+ return Status{};
}
void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index a88dc65..5d6163d 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,7 +89,7 @@
{
AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
window_changed = window_changed || update_window_and_padding(win, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
}
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 52e3006..58da040 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -333,7 +333,7 @@
AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x),
output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
INEKernel::configure(win);
}
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 47372c2..ff4802c 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,7 @@
#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
@@ -35,6 +36,8 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
#include <algorithm>
#include <arm_neon.h>
#include <cmath>
@@ -98,6 +101,56 @@
return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
}
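+// Scale each lane of a vector of 8 pooled sums by 1 / (pool area), where the area is
+// clamped to the input borders (and, when exclude_padding is set, restricted to the
+// valid region). id_offset and step describe which output column each lane maps to.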
+template <bool exclude_padding>
+inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
+ const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = (id.x() + id_offset) * stride_x - pad_x;
+ int start_y = id.y() * stride_y - pad_y;
+ const int end_y = std::min(start_y + pool_size, upper_bound_h);
+ if(exclude_padding)
+ {
+ start_y = std::max(0, start_y);
+ }
+
+ std::array<uint16_t, 8> elems =
+ {
+ {
+ vgetq_lane_u16(v, 0),
+ vgetq_lane_u16(v, 1),
+ vgetq_lane_u16(v, 2),
+ vgetq_lane_u16(v, 3),
+ vgetq_lane_u16(v, 4),
+ vgetq_lane_u16(v, 5),
+ vgetq_lane_u16(v, 6),
+ vgetq_lane_u16(v, 7),
+ }
+ };
+
+ for(auto &el : elems)
+ {
+ int c_start_x = start_x;
+ const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
+ if(exclude_padding)
+ {
+ c_start_x = std::max(0, c_start_x);
+ }
+ float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
+ el *= scale;
+ start_x += step * stride_x;
+ }
+
+ v = vsetq_lane_u16(elems[0], v, 0);
+ v = vsetq_lane_u16(elems[1], v, 1);
+ v = vsetq_lane_u16(elems[2], v, 2);
+ v = vsetq_lane_u16(elems[3], v, 3);
+ v = vsetq_lane_u16(elems[4], v, 4);
+ v = vsetq_lane_u16(elems[5], v, 5);
+ v = vsetq_lane_u16(elems[6], v, 6);
+ v = vsetq_lane_u16(elems[7], v, 7);
+}
+
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, unsigned int &pooled_w, unsigned int pooled_h, int pool_size)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -114,9 +167,9 @@
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
static const std::set<int> supported_pool_sizes = { 2, 3 };
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_fixed_point(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && (input->data_type() != DataType::F32));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)));
ARM_COMPUTE_RETURN_ERROR_ON(!is_global_pooling && (pool_pad_x >= pool_size || pool_pad_y >= pool_size));
ARM_COMPUTE_RETURN_ERROR_ON(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()));
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
@@ -185,6 +238,26 @@
}
num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
break;
+ case DataType::QASYMM8:
+ switch(pool_size)
+ {
+ case 2:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ break;
+ case 3:
+ num_elems_read_per_iteration = 16;
+ num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
+ num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
+ break;
+ default:
+ num_elems_read_per_iteration = 1;
+ num_elems_processed_per_iteration = 1;
+ num_elems_horizontal_window = 1;
+ break;
+ }
+ break;
case DataType::QS16:
num_elems_read_per_iteration = 8;
switch(pool_size)
@@ -244,7 +317,11 @@
break;
}
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+ // Number of iterations in X dimension
+ const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+
+ // Upper limit for the number of right/bottom border elements that are accessed
+ const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
border_size = BorderSize(pool_pad_y, pool_pad_x);
@@ -290,32 +367,25 @@
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- PoolingType pool_type = pool_info.pool_type();
- int pool_size = pool_info.pool_size();
+ const PoolingType pool_type = pool_info.pool_type();
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
const bool exclude_padding = pool_info.exclude_padding();
const bool is_global_pooling = pool_info.is_global_pooling();
- std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+ const int pool_stride_x = pad_stride_info.stride().first;
// Update pool size in case of global pooling
- pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
+ const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size();
// Validate pool info before calling scaled_dimensions
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));
// Check output dimensions
+ unsigned int pooled_w, pooled_h;
std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
input->info()->dimension(1),
pool_size,
pool_size,
- pool_info.pad_stride_info());
+ pad_stride_info);
// Output auto initialization if not yet initialized
auto_init(input->info(), output->info(), pooled_w, pooled_h);
@@ -328,12 +398,15 @@
_output = output;
_pool_info = pool_info;
+ // Get data type
+ const DataType data_type = input->info()->data_type();
+
// Select appropriate function
- switch(pool_size)
+ if(data_type == DataType::QS8)
{
- case 2:
- if(input->info()->data_type() == DataType::QS8)
- {
+ switch(pool_size)
+ {
+ case 2:
switch(pool_type)
{
case PoolingType::AVG:
@@ -345,9 +418,74 @@
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
- }
- else if(input->info()->data_type() == DataType::QS16)
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ }
+ }
+ else if(data_type == DataType::QASYMM8)
+ {
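+ // The vectorized 2x2 and 3x3 QASYMM8 paths only support strides 1 and 2; any other
+ // combination falls back to the generic poolingN_qasymm8 implementation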
+ if(pool_size == 2 && pool_stride_x < 3)
+ {
+ switch(pool_type)
{
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::AVG, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_qasymm8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else if(pool_size == 3 && pool_stride_x < 3)
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::AVG, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_qasymm8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ else
+ {
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::AVG, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingN_qasymm8<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ }
+ }
+ else if(data_type == DataType::QS16)
+ {
+ switch(pool_size)
+ {
+ case 2:
switch(pool_type)
{
case PoolingType::AVG:
@@ -359,9 +497,29 @@
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
- }
- else if(input->info()->data_type() == DataType::F16)
- {
+ break;
+ case 3:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ }
+ }
+ else if(data_type == DataType::F16)
+ {
+ switch(pool_size)
+ {
+ case 2:
switch(pool_type)
{
case PoolingType::AVG:
@@ -376,56 +534,8 @@
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
- }
- else if(input->info()->data_type() == DataType::F32)
- {
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- }
- break;
- case 3:
- if(input->info()->data_type() == DataType::QS8)
- {
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q8<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- }
- else if(input->info()->data_type() == DataType::QS16)
- {
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q16<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- }
- else if(input->info()->data_type() == DataType::F16)
- {
+ break;
+ case 3:
switch(pool_type)
{
case PoolingType::AVG:
@@ -440,9 +550,32 @@
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
- }
- else if(input->info()->data_type() == DataType::F32)
- {
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling size!");
+ }
+ }
+ else if(data_type == DataType::F32)
+ {
+ switch(pool_size)
+ {
+ case 2:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling2_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling2_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ case 3:
switch(pool_type)
{
case PoolingType::AVG:
@@ -457,40 +590,40 @@
default:
ARM_COMPUTE_ERROR("Unsupported pooling type!");
}
- }
- break;
- case 7:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
- break;
- case PoolingType::L2:
- _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
+ break;
+ case 7:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::pooling7_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::pooling7_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ default:
+ switch(pool_type)
+ {
+ case PoolingType::AVG:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::AVG, false>;
+ break;
+ case PoolingType::L2:
+ _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, true> : &NEPoolingLayerKernel::poolingN_f32<PoolingType::L2, false>;
+ break;
+ case PoolingType::MAX:
+ _func = &NEPoolingLayerKernel::poolingN_f32<PoolingType::MAX, false>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported pooling type!");
+ }
+ break;
+ }
}
// Configure kernel window
@@ -563,6 +696,119 @@
input, output);
}
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::pooling2_qasymm8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 2;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+
+ const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
+ const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+ uint8x8_t lower_res = {};
+ uint8x8_t upper_res = {};
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
+ const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
+
+ // Add rows
+ const uint16x8x2_t vrsum =
+ {
+ {
+ vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]),
+ vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]),
+ }
+ };
+
+ // Pair-wise add row data
+ const uint16x4x2_t vpsum =
+ {
+ {
+ vpadd_u16(vget_low_u16(vrsum.val[0]), vget_high_u16(vrsum.val[0])),
+ vpadd_u16(vget_low_u16(vrsum.val[1]), vget_high_u16(vrsum.val[1])),
+ }
+ };
+
+ uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]);
+
+ // Scale lower result
+ scale_vector_s16x8<exclude_padding>(res_lower, id, 0, scale_step_x,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ lower_res = vmovn_u16(res_lower);
+
+ // Compute upper result for stride_x == 1
+ if(pool_stride_x == 1)
+ {
+ // Shifted row sum
+ const uint16x8x2_t vrsum_shifted =
+ {
+ {
+ vextq_u16(vrsum.val[0], vrsum.val[1], 1),
+ vextq_u16(vrsum.val[1], vrsum.val[1], 1)
+ }
+ };
+
+ // Pair-wise add shifted row
+ const uint16x4x2_t vpsum_shifted =
+ {
+ {
+ vpadd_u16(vget_low_u16(vrsum_shifted.val[0]), vget_high_u16(vrsum_shifted.val[0])),
+ vpadd_u16(vget_low_u16(vrsum_shifted.val[1]), vget_high_u16(vrsum_shifted.val[1])),
+ }
+ };
+ uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]);
+
+ // Scale upper result
+ scale_vector_s16x8<exclude_padding>(res_upper, id, 1, 2,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ upper_res = vmovn_u16(res_upper);
+ }
+ }
+ else
+ {
+ const uint8x16_t max_data = vmaxq_u8(top_data, bottom_data);
+ lower_res = vpmax_u8(vget_low_u8(max_data), vget_high_u8(max_data));
+ if(pool_stride_x == 1)
+ {
+ const uint8x16_t max_data_shifted = vextq_u8(max_data, max_data, 1);
+ upper_res = vpmax_u8(vget_low_u8(max_data_shifted), vget_high_u8(max_data_shifted));
+ }
+ }
+
+ // Store result
+ if(pool_stride_x == 1)
+ {
+ const uint8x8x2_t res = { { lower_res, upper_res } };
+ vst2_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), lower_res);
+ }
+ },
+ input, output);
+}
+
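Aside (not part of the patch): the AVG branch above sums each 2x2 neighbourhood in u16 and then rescales each lane by the number of contributing elements via scale_vector_s16x8. A minimal scalar sketch of that per-output computation, using plain arrays and a hypothetical helper name instead of the library's Iterator/Window machinery:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Scalar sketch of one 2x2 QASYMM8 average-pooling output. `in` is a
// row-major width x height uint8_t image; out-of-bounds taps read as zero padding.
static uint8_t avg_pool_2x2_sketch(const uint8_t *in, int width, int height,
                                   int out_x, int out_y,
                                   int stride_x, int stride_y,
                                   int pad_x, int pad_y,
                                   bool exclude_padding)
{
    const int start_x = out_x * stride_x - pad_x;
    const int start_y = out_y * stride_y - pad_y;
    uint32_t  sum     = 0;
    int       count   = 0;
    for(int dy = 0; dy < 2; ++dy)
    {
        for(int dx = 0; dx < 2; ++dx)
        {
            const int  x      = start_x + dx;
            const int  y      = start_y + dy;
            const bool inside = (x >= 0 && x < width && y >= 0 && y < height);
            if(inside)
            {
                sum += in[y * width + x];
            }
            // With exclude_padding the divisor counts only valid pixels,
            // otherwise the full 2x2 window is counted.
            count += (exclude_padding ? (inside ? 1 : 0) : 1);
        }
    }
    return static_cast<uint8_t>(std::lround(static_cast<float>(sum) / std::max(count, 1)));
}

The NEON path above computes eight (sixteen, for stride 1) such outputs at once; the per-lane scale it applies corresponds to the `count` divisor here.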
template <PoolingType pooling_type>
void NEPoolingLayerKernel::pooling2_q16(const Window &window_input, const Window &window)
{
@@ -892,6 +1138,125 @@
input, output);
}
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::pooling3_qasymm8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ constexpr int pool_size = 3;
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+
+ const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y)));
+ const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 1));
+ const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_x), -static_cast<int>(pool_pad_y) + 2));
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const auto top_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_top_ptr + input.offset()));
+ const auto middle_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_middle_ptr + input.offset()));
+ const auto bottom_data = vld1q_u8(reinterpret_cast<const uint8_t *>(input_bottom_ptr + input.offset()));
+
+ if(pooling_type == PoolingType::AVG)
+ {
+ // Convert data to u16
+ const uint16x8x2_t top_data_u16 = { { vmovl_u8(vget_low_u8(top_data)), vmovl_u8(vget_high_u8(top_data)) } };
+ const uint16x8x2_t middle_data_u16 = { { vmovl_u8(vget_low_u8(middle_data)), vmovl_u8(vget_high_u8(middle_data)) } };
+ const uint16x8x2_t bottom_data_u16 = { { vmovl_u8(vget_low_u8(bottom_data)), vmovl_u8(vget_high_u8(bottom_data)) } };
+
+ // Calculate row sums
+ const uint16x8x2_t vrsum =
+ {
+ {
+ vaddq_u16(vaddq_u16(top_data_u16.val[0], bottom_data_u16.val[0]), middle_data_u16.val[0]),
+ vaddq_u16(vaddq_u16(top_data_u16.val[1], bottom_data_u16.val[1]), middle_data_u16.val[1]),
+ }
+ };
+ const uint16x8x2_t vrsum_shifted_1 =
+ {
+ {
+ vextq_u16(vrsum.val[0], vrsum.val[1], 1),
+ vextq_u16(vrsum.val[1], vrsum.val[1], 1)
+ }
+ };
+ const uint16x8x2_t vrsum_shifted_2 =
+ {
+ {
+ vextq_u16(vrsum.val[0], vrsum.val[1], 2),
+ vextq_u16(vrsum.val[1], vrsum.val[1], 2)
+ }
+ };
+ // Calculate final sum
+ uint16x8x2_t final_sum =
+ {
+ {
+ vaddq_u16(vaddq_u16(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
+ vaddq_u16(vaddq_u16(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
+ }
+ };
+ if(pool_stride_x == 2)
+ {
+ uint16x8_t res =
+ {
+ vgetq_lane_u16(final_sum.val[0], 0),
+ vgetq_lane_u16(final_sum.val[0], 2),
+ vgetq_lane_u16(final_sum.val[0], 4),
+ vgetq_lane_u16(final_sum.val[0], 6),
+ vgetq_lane_u16(final_sum.val[1], 0),
+ vgetq_lane_u16(final_sum.val[1], 2),
+ vgetq_lane_u16(final_sum.val[1], 4),
+ vgetq_lane_u16(final_sum.val[1], 6),
+ };
+
+ scale_vector_s16x8<exclude_padding>(res, id, 0, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(res));
+ }
+ else
+ {
+ // Scale lower result
+ scale_vector_s16x8<exclude_padding>(final_sum.val[0], id, 0, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ // Scale upper result
+ scale_vector_s16x8<exclude_padding>(final_sum.val[1], id, 8, 1,
+ pool_size, upper_bound_w, upper_bound_h,
+ pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+ const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1]));
+ vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+ }
+ }
+ else
+ {
+ const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data);
+ const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1);
+ const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2);
+ const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2);
+
+ if(pool_stride_x == 2)
+ {
+ const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } };
+ static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ const uint8x8_t res = vtbl2_u8(table, lookup_val);
+ vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), res);
+ }
+ else
+ {
+ vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), final_max);
+ }
+ }
+ },
+ input, output);
+}
+
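For reference: the vextq_u16/vextq_u8 shifts above implement a 3-wide horizontal window by adding a vector of per-column row sums to copies of itself shifted by one and by two lanes. A scalar model of that trick (illustrative only, not library code):

#include <cstdint>
#include <vector>

// Scalar model of the shifted-vector trick: lane i of the result is the
// 3-wide horizontal window sum that starts at column i.
std::vector<uint16_t> horizontal_sum3(const std::vector<uint16_t> &row_sums)
{
    std::vector<uint16_t> out(row_sums.size(), 0);
    for(size_t i = 0; i + 2 < row_sums.size(); ++i)
    {
        // Equivalent to: row_sums + (row_sums shifted by 1 lane) + (row_sums shifted by 2 lanes)
        out[i] = static_cast<uint16_t>(row_sums[i] + row_sums[i + 1] + row_sums[i + 2]);
    }
    return out;
}

With pool_stride_x == 2 only every other lane of that result is an actual output, which is what the vgetq_lane_u16 gather in the AVG branch and the vtbl2_u8 lookup over {0, 2, 4, ..., 14} in the MAX branch select.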
template <PoolingType pooling_type>
void NEPoolingLayerKernel::pooling3_q16(const Window &window_input, const Window &window)
{
@@ -1232,6 +1597,98 @@
input, output);
}
+template <PoolingType pooling_type, bool exclude_padding>
+void NEPoolingLayerKernel::poolingN_qasymm8(const Window &window_input, const Window &window)
+{
+ Iterator input(_input, window_input);
+ Iterator output(_output, window);
+
+ const int pool_size = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size();
+ int pool_pad_x = 0;
+ int pool_pad_y = 0;
+ int pool_stride_x = 0;
+ int pool_stride_y = 0;
+ std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad();
+ std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+ const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x);
+ const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ uint8_t res = 0;
+
+ if(pooling_type != PoolingType::MAX)
+ {
+ uint32x4_t vres = vdupq_n_u32(0);
+ uint32_t sres = 0;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale<exclude_padding>(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y);
+
+ // Perform pooling
+ for(int y = 0; y < pool_size; ++y)
+ {
+ int x = 0;
+ for(; x <= (pool_size - 8); x += 8)
+ {
+ const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+
+ const uint16x8_t data_u16 = vmovl_u8(data);
+ vres = vaddq_u32(vres, vaddl_u16(vget_high_u16(data_u16), vget_low_u16(data_u16)));
+ }
+
+ // Leftover for loop
+ for(; x < pool_size; ++x)
+ {
+ uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+ sres += data;
+ }
+ }
+
+ // Reduction
+ const auto tmp = vpadd_u32(vget_high_u32(vres), vget_low_u32(vres));
+ sres += vget_lane_u32(tmp, 0) + vget_lane_u32(tmp, 1);
+
+ // Divide by scale
+ res = static_cast<uint8_t>(support::cpp11::round(sres * scale));
+ }
+ else
+ {
+ uint8x8_t vres = vdup_n_u8(0);
+ res = 0;
+
+ for(int y = 0; y < pool_size; ++y)
+ {
+ int x = 0;
+ for(; x <= (pool_size - 8); x += 8)
+ {
+ const uint8x8_t data = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+ vres = vmax_u8(vres, data);
+ }
+
+ // Leftover for loop
+ for(; x < pool_size; ++x)
+ {
+ const uint8_t data = *(reinterpret_cast<const uint8_t *>(input.ptr() + (x - pool_pad_x) * _input->info()->strides_in_bytes().x() + (y - pool_pad_y) * _input->info()->strides_in_bytes().y()));
+ res = std::max(res, data);
+ }
+ }
+
+ // Reduce max
+ vres = vpmax_u8(vres, vres);
+ vres = vpmax_u8(vres, vres);
+ vres = vpmax_u8(vres, vres);
+
+ // Get max value
+ res = std::max(res, vget_lane_u8(vres, 0));
+ }
+
+ // Store result
+ *(reinterpret_cast<uint8_t *>(output.ptr())) = res;
+ },
+ input, output);
+}
+
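A note on the scale used above: calculate_avg_scale is expected to return the reciprocal of the number of window elements, with the window clipped against the upper bounds and, when exclude_padding is set, against the start of the tensor as well. A standalone sketch of that computation (the function name and exact clipping here are assumptions for illustration, not the library's definition):

#include <algorithm>

// Sketch of the averaging scale: 1 / (number of window elements after clipping).
float avg_scale_sketch(int out_x, int out_y, int pool_size,
                       int upper_bound_w, int upper_bound_h,
                       int pad_x, int pad_y, int stride_x, int stride_y,
                       bool exclude_padding)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    const int end_x = std::min(start_x + pool_size, upper_bound_w);
    const int end_y = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.0f / static_cast<float>((end_y - start_y) * (end_x - start_x));
}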
Status NEPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
@@ -1269,6 +1726,7 @@
const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first;
const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second;
+ const unsigned int pool_size = _pool_info.pool_size();
// Set step for input in x and y direction
Window window_input(window);
@@ -1282,6 +1740,15 @@
window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
break;
}
+ case DataType::QASYMM8:
+ {
+ window_x_inc = pool_stride_x;
+ if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+ {
+ window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+ }
+ break;
+ }
case DataType::F32:
{
window_x_inc = pool_stride_x;
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index ab8ab14..0fa8278 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -287,10 +287,10 @@
break;
case InterpolationPolicy::BILINEAR:
{
- const auto xi = clamp<int>(std::floor(x0), min_x - 1, max_x);
- const auto yi = clamp<int>(std::floor(y0), min_y - 1, max_y);
- const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
- const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
+ const auto xi = utility::clamp<int>(std::floor(x0), min_x - 1, max_x);
+ const auto yi = utility::clamp<int>(std::floor(y0), min_y - 1, max_y);
+ const auto xi_1 = utility::clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
+ const auto yi_1 = utility::clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
const float dx = x0 - std::floor(x0);
const float dy = y0 - std::floor(y0);
@@ -396,8 +396,8 @@
else
{
// Clamp coordinates
- const auto xi = clamp<int>(std::floor(x0), min_x, max_x - 1);
- const auto yi = clamp<int>(std::floor(y0), min_y, max_y - 1);
+ const auto xi = utility::clamp<int>(std::floor(x0), min_x, max_x - 1);
+ const auto yi = utility::clamp<int>(std::floor(y0), min_y, max_y - 1);
switch(interpolation)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -405,8 +405,8 @@
break;
case InterpolationPolicy::BILINEAR:
{
- const auto xi_1 = clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
- const auto yi_1 = clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
+ const auto xi_1 = utility::clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
+ const auto yi_1 = utility::clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
const float dx = x0 - std::floor(x0);
const float dy = y0 - std::floor(y0);
@@ -636,10 +636,10 @@
break;
case InterpolationPolicy::BILINEAR:
{
- const auto xi = clamp<int>(std::floor(xn), min_x - 1, max_x);
- const auto yi = clamp<int>(std::floor(yn), min_y - 1, max_y);
- const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x - 1, max_x);
- const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y - 1, max_y);
+ const auto xi = utility::clamp<int>(std::floor(xn), min_x - 1, max_x);
+ const auto yi = utility::clamp<int>(std::floor(yn), min_y - 1, max_y);
+ const auto xi_1 = utility::clamp<int>(std::floor(xn + 1), min_x - 1, max_x);
+ const auto yi_1 = utility::clamp<int>(std::floor(yn + 1), min_y - 1, max_y);
const float dx = xn - std::floor(xn);
const float dy = yn - std::floor(yn);
@@ -762,8 +762,8 @@
else
{
// Clamp coordinates
- const auto xi = clamp<int>(std::floor(xn), min_x, max_x - 1);
- const auto yi = clamp<int>(std::floor(yn), min_y, max_y - 1);
+ const auto xi = utility::clamp<int>(std::floor(xn), min_x, max_x - 1);
+ const auto yi = utility::clamp<int>(std::floor(yn), min_y, max_y - 1);
switch(interpolation)
{
case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -771,8 +771,8 @@
break;
case InterpolationPolicy::BILINEAR:
{
- const auto xi_1 = clamp<int>(std::floor(xn + 1), min_x, max_x - 1);
- const auto yi_1 = clamp<int>(std::floor(yn + 1), min_y, max_y - 1);
+ const auto xi_1 = utility::clamp<int>(std::floor(xn + 1), min_x, max_x - 1);
+ const auto yi_1 = utility::clamp<int>(std::floor(yn + 1), min_y, max_y - 1);
const float dx = xn - std::floor(xn);
const float dy = yn - std::floor(yn);
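The hunks above only qualify clamp with the utility namespace; the surrounding bilinear path then mixes the four clamped neighbours with the fractional offsets dx and dy. A self-contained sketch of that interpolation, using std::clamp (C++17) and a hypothetical pixel accessor rather than the kernel's internal helpers:

#include <algorithm>
#include <cmath>

// Sketch of the bilinear sample built from the clamped integer coordinates
// (xi, yi), (xi_1, yi_1) and the fractional parts dx, dy. `pixel` is a
// hypothetical accessor returning the value at integer coordinates.
template <typename PixelFn>
float bilinear_sample(PixelFn pixel, float x0, float y0,
                      int min_x, int max_x, int min_y, int max_y)
{
    const int   xi   = std::clamp(static_cast<int>(std::floor(x0)), min_x, max_x - 1);
    const int   yi   = std::clamp(static_cast<int>(std::floor(y0)), min_y, max_y - 1);
    const int   xi_1 = std::clamp(static_cast<int>(std::floor(x0 + 1)), min_x, max_x - 1);
    const int   yi_1 = std::clamp(static_cast<int>(std::floor(y0 + 1)), min_y, max_y - 1);
    const float dx   = x0 - std::floor(x0);
    const float dy   = y0 - std::floor(y0);

    return (1.0f - dx) * (1.0f - dy) * pixel(xi, yi)
           + dx * (1.0f - dy) * pixel(xi_1, yi)
           + (1.0f - dx) * dy * pixel(xi, yi_1)
           + dx * dy * pixel(xi_1, yi_1);
}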
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index d52e88c..794c179 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -86,6 +86,57 @@
},
in);
}
+
+TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
+{
+ TensorShape output_shape{ input->tensor_shape() };
+
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
+
+ return output_shape;
+}
+
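For illustration (not part of the patch), here is get_output_shape() above applied to concrete numbers, e.g. a 3x3 kernel with 64 input and 128 output feature maps:

#include <array>
#include <cstddef>
#include <iostream>

// The first three dimensions (kernel width, kernel height, input feature maps)
// are collapsed, the result is swapped with the OFM dimension, and one extra
// row is appended when a bias is present.
int main()
{
    const std::array<size_t, 4> weights{ 3, 3, 64, 128 }; // W x H x IFM x OFM
    const bool has_bias = true;

    const size_t collapsed = weights[0] * weights[1] * weights[2]; // 576
    const size_t dim0      = weights[3];                           // 128 (OFM)
    const size_t dim1      = collapsed + (has_bias ? 1 : 0);       // 577

    std::cout << "reshaped weights: " << dim0 << " x " << dim1 << "\n"; // 128 x 577
}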
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
+ }
+
+ // Checks performed when output is configured
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ Window window = calculate_max_window(*input, Steps());
+ window.set(Window::DimX, Window::Dimension(0, input->dimension(0), input->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, input->dimension(1), input->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, input->dimension(2), input->dimension(2)));
+
+ // The NEWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+ return std::make_pair(Status{}, window);
+}
} // namespace
NEWeightsReshapeKernel::NEWeightsReshapeKernel()
@@ -95,35 +146,15 @@
void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- const int fixed_point_position = input->info()->fixed_point_position();
- const DataType dt = input->info()->data_type();
- const TensorShape &input_shape = input->info()->tensor_shape();
- TensorShape output_shape{ input_shape };
- output_shape.collapse(3);
-
- const size_t tmp_dim = output_shape[0];
- output_shape.set(0, output_shape[1]);
- output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, dt, fixed_point_position);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), (bias != nullptr))));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->num_dimensions() != 1));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->num_dimensions() != 2));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3]));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (bias->info()->dimension(0) != input->info()->tensor_shape()[3] || bias->info()->dimension(1) != input->info()->tensor_shape()[4]));
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+ (bias != nullptr) ? bias->info() : nullptr,
+ output->info()));
_input = input;
_bias = bias;
@@ -154,15 +185,17 @@
}
// Configure kernel
- Window window = calculate_max_window(*input->info(), Steps());
- window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+ auto win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
- // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+Status NEWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
- INEKernel::configure(window);
+ return Status{};
}
void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
index fe63336..ea48e1f 100644
--- a/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,35 @@
#include "arm_compute/core/TensorInfo.h"
#include "support/ToolchainSupport.h"
-#include "src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp"
+#include "arm_compute/core/NEON/kernels/winograd/winograd_layer.hpp"
-using T = winograd_shim_nchw::Winograd2x2_3x3GEMM<float, float>;
+namespace
+{
+using T = WinogradConvolutionLayer<2, 2, 3, 3, float, float>;
+} // namespace
namespace arm_compute
{
class Winograd3x3F32::Private
{
public:
- Private(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage)
- : convolver(kernel_shape, input_shape, padding_type, kernel_storage)
+ Private(
+ const int n_batches, /** Number of batches in the input and output tensors. */
+ const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */
+ const int n_input_rows, /** Number of rows in a feature map of the input tensor. */
+ const int n_input_cols, /** Number of columns in a feature map of the input tensor. */
+ const int n_output_channels, /** Number of feature maps in the output tensor. */
+ const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */
+ const float *const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps". */
+ float *const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
+ const float *const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */
+ float *const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
+ float *const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */
+ float *const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
+ )
+ : convolver(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output, winograd_output)
{
}
-
T convolver;
};
@@ -50,88 +65,85 @@
{
}
-void Winograd3x3F32::nchw2nhwc(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, const void *const input)
+void Winograd3x3F32::transform_output()
{
- _pimpl->convolver.nchw2nhwc(input_shape, padding_type, working_space, reinterpret_cast<const float *>(input));
+ auto win = _pimpl->convolver.output_transform.get_window();
+ _pimpl->convolver.output_transform.run(0, win);
}
-void Winograd3x3F32::nhwc2nchw(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, void *const output)
+void Winograd3x3F32::transform_input()
{
- _pimpl->convolver.nhwc2nchw(input_shape, padding_type, working_space, reinterpret_cast<float *const>(output));
+ auto win = _pimpl->convolver.input_transform.get_window();
+ _pimpl->convolver.input_transform.run(0, win);
}
-void Winograd3x3F32::transform_weights(const void *const kernel, void *transform_working_space)
+void Winograd3x3F32::transform_weights()
{
- _pimpl->convolver.transform_weights(reinterpret_cast<const float *>(kernel), transform_working_space);
+ auto win = _pimpl->convolver.weights_transform.get_window();
+ _pimpl->convolver.weights_transform.run(0, win);
}
-void Winograd3x3F32::reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const void *const input, void *working_space)
-{
- _pimpl->convolver.reshape_input(input_shape, padding_type, reinterpret_cast<const float *>(input), working_space);
-}
-
-void Winograd3x3F32::reshape_output(const Tensor4DShape &input_shape, const PaddingType padding_type, void *const output)
-{
-#if defined(__aarch64__)
- _pimpl->convolver.reshape_output(input_shape, padding_type, reinterpret_cast<float *const>(output));
-#else /* __aarch64__ */
- ARM_COMPUTE_UNUSED(input_shape);
- ARM_COMPUTE_UNUSED(padding_type);
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_ERROR("Not implemented");
-#endif /* __aarch64__ */
-}
-
-std::pair<void *, void *> Winograd3x3F32::get_nhwc_ptrs(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space)
-{
- return _pimpl->convolver.get_nhwc_ptrs(input_shape, padding_type, working_space);
-}
-
-Winograd3x3F32::Winograd3x3F32(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage)
- : _pimpl(support::cpp14::make_unique<Private>(kernel_shape, input_shape, padding_type, kernel_storage))
+Winograd3x3F32::Winograd3x3F32(
+ const int n_batches, /** Number of batches in the input and output tensors. */
+ const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */
+ const int n_input_rows, /** Number of rows in a feature map of the input tensor. */
+ const int n_input_cols, /** Number of columns in a feature map of the input tensor. */
+ const int n_output_channels, /** Number of feature maps in the output tensor. */
+ const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */
+ const float *const weights, /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Rows x Input Feature Maps x Output Feature Maps". */
+ float *const weights_storage, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
+ const float *const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */
+ float *const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
+ float *const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */
+ float *const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
+)
+ : _pimpl(support::cpp14::make_unique<Private>(n_batches, n_input_channels, n_input_rows, n_input_cols, n_output_channels, same_padding, weights, weights_storage, input, winograd_input, output,
+ winograd_output))
{
}
-size_t NEWinogradLayerKernel::get_kernel_storage_size(const KernelShape &shape)
+unsigned int NEWinogradLayerKernel::get_input_storage_size(const int n_batches, const int n_channels, const int n_rows, const int n_cols, const bool same_padding)
{
- return T::get_kernel_storage_size(shape);
+ return T::get_input_storage_size(n_batches, n_channels, n_rows, n_cols, same_padding);
}
-size_t NEWinogradLayerKernel::get_working_space_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, const PaddingType padding)
+unsigned int NEWinogradLayerKernel::get_output_storage_size(
+ const int n_batches, /** Number of batches in the output tensor. */
+ const int n_rows, /** Number of rows in each feature map of the input tensor. */
+ const int n_cols, /** Number of columns in each feature map of the input tensor. */
+ const int n_output_channels, /** Number of feature maps in the output tensor. */
+ const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+)
{
- return T::get_working_space_size(input_shape, k_shape, padding);
+ return T::get_output_storage_size(n_batches, n_rows, n_cols, n_output_channels, same_padding);
}
-size_t NEWinogradLayerKernel::get_kernel_transform_working_size(const KernelShape &shape)
+unsigned int NEWinogradLayerKernel::get_weight_storage_size(const int n_output_channels, const int n_input_channels)
{
- return T::get_kernel_transform_working_size(shape);
+ return T::get_weight_storage_size(n_output_channels, n_input_channels);
}
NEWinogradLayerKernel::NEWinogradLayerKernel()
- : _convolver(nullptr), _output(nullptr)
+ : _convolver(nullptr)
{
}
-void NEWinogradLayerKernel::configure(ITensor *output, Winograd3x3F32 *convolver)
+void NEWinogradLayerKernel::configure(Winograd3x3F32 *convolver)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(convolver);
_convolver = convolver;
- Window win = calculate_max_window(*output->info());
+ Window win;
+ auto win_last = _convolver->_pimpl->convolver.gemms.get_window();
+ win.set(Window::DimX, Window::Dimension(0, win_last, 1));
INEKernel::configure(win);
}
void NEWinogradLayerKernel::run(const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(window);
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(info.num_threads < 1);
- const size_t tid = info.thread_id;
- const size_t num_threads = std::min(info.num_threads, 16);
- const size_t num_gemms_per_thread = 16 / num_threads;
- const size_t first_gemm = tid * num_gemms_per_thread;
- const size_t last_gemm = (tid == (num_threads - 1)) ? 15 : first_gemm + num_gemms_per_thread - 1;
- _convolver->_pimpl->convolver.execute(first_gemm, last_gemm);
+ const size_t first_gemm = window.x().start();
+ const size_t last_gemm = window.x().end();
+ _convolver->_pimpl->convolver.gemms.run(first_gemm, last_gemm);
}
} // namespace arm_compute
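With this change the kernel's window spans the number of batched GEMMs reported by gemms.get_window(), so the scheduler can hand each thread a contiguous [start, stop) slice instead of the previous hard-coded split over 16 GEMMs. A rough sketch of that kind of partitioning (illustrative only; the library's scheduler, not a helper like this, does the real splitting):

#include <algorithm>
#include <thread>
#include <vector>

// Split [0, n_work_items) into contiguous chunks and run one chunk per thread.
// `run` stands in for a work item such as the batched GEMM collection.
template <typename RunFn>
void split_over_threads(unsigned int n_work_items, unsigned int n_threads, RunFn run)
{
    std::vector<std::thread> workers;
    const unsigned int chunk = (n_work_items + n_threads - 1) / n_threads;
    for(unsigned int t = 0; t < n_threads; ++t)
    {
        const unsigned int start = t * chunk;
        const unsigned int stop  = std::min(start + chunk, n_work_items);
        if(start >= stop)
        {
            break;
        }
        workers.emplace_back([=]() { run(start, stop); });
    }
    for(auto &w : workers)
    {
        w.join();
    }
}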
diff --git a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
index ad0743b..bffcbbf 100644
--- a/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
+++ b/src/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.cpp
@@ -50,20 +50,20 @@
namespace arm_compute
{
-void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMAArch32Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -104,7 +104,7 @@
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<sgemm_8x6, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
index d70524b..0eaa9aa 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.cpp
@@ -50,20 +50,20 @@
namespace arm_compute
{
-void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -104,7 +104,7 @@
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<sgemm_12x8, float, float> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
index e020cd9..80606dc 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.cpp
@@ -56,7 +56,8 @@
{
}
-void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_s16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
+ const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -77,7 +78,7 @@
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_s16_12x8, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -99,7 +100,8 @@
in0, out);
}
-void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_u16_12x8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1,
+ const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -120,7 +122,7 @@
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -142,20 +144,21 @@
in0, out);
}
-void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64A53Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
switch(input0->info()->data_type())
{
@@ -192,7 +195,7 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+ (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
}
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
index db37201..38f82f0 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.cpp
@@ -56,7 +56,7 @@
{
}
-void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_s8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -77,7 +77,7 @@
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_s8_4x4, int8_t, int32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -99,7 +99,7 @@
in0, out);
}
-void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window,
+void gemm_interleaved_u8(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1, const Window &window,
const ThreadInfo &info)
{
const int lda = input0->info()->strides_in_bytes().y();
@@ -120,7 +120,7 @@
Iterator in0(input0, window);
Iterator out(output, window);
- GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, !transform_1, !transform_1);
+ GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
@@ -142,20 +142,21 @@
in0, out);
}
-void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8, DataType::U8);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::U32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
switch(input0->info()->data_type())
{
@@ -192,7 +193,7 @@
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _transform_0, _transform_1, window, info);
+ (*_func)(_input0, _input1, _output, _workspace, _alpha, _beta, _is_transposed_0, _is_transposed_1, window, info);
}
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8A */
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index e996e57..7827bc1 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -38,6 +38,7 @@
namespace arm_compute
{
#include "arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp"
+#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp"
#include "arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp"
} // namespace arm_compute
@@ -54,7 +55,7 @@
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::U8, DataType::S8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
@@ -80,24 +81,57 @@
Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
return std::make_pair(err, win);
}
+
+template <typename strategy, typename To, typename Tr>
+void *align_workspace(GemmInterleaved<strategy, To, Tr> &gemm, const ThreadInfo &info, ITensor *ws)
+{
+ constexpr size_t alignment = 4096;
+ const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
+ void *workspace = ws->buffer() + offset;
+ size_t workspace_size = ws->info()->total_size();
+
+ if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ {
+ ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ }
+ return workspace;
+}
+
+template <typename strategy>
+void execute_gemm(const Window &win, Iterator &in0, Iterator &in1, Iterator &out,
+ const ThreadInfo &info, ITensor *ws, int M, int N, int K, bool is_transposed_0, bool is_transposed_1,
+ int lda, int ldb, int ldc, float alpha, float beta)
+{
+ GemmInterleaved<strategy, typename strategy::operand_type, typename strategy::result_type> gemm(&info.cpu_info, M, N, K, is_transposed_0, is_transposed_1);
+ void *workspace = align_workspace(gemm, info, ws);
+ execute_window_loop(win, [&](const Coordinates & id)
+ {
+ gemm.execute(reinterpret_cast<const typename strategy::operand_type *>(in0.ptr()), lda,
+ reinterpret_cast<const typename strategy::operand_type *>(in1.ptr()), ldb,
+ reinterpret_cast<typename strategy::result_type *>(out.ptr()), ldc,
+ alpha, beta, workspace);
+ },
+ in0, out);
+}
} // namespace
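align_workspace() above gives each thread its own slice of the shared workspace tensor: the slice starts at thread_id times the over-allocated per-thread working size and is then bumped to a 4096-byte boundary with std::align. A standalone sketch of the same slicing on a raw buffer (the function name and the explicit overflow guard are additions for illustration):

#include <cstddef>
#include <cstdint>
#include <memory>

// Per-thread workspace slicing with alignment. `buffer` should hold at least
// n_threads * (working_size + alignment - 1) bytes.
void *thread_workspace(uint8_t *buffer, size_t buffer_size,
                       size_t working_size, size_t alignment,
                       unsigned int thread_id)
{
    const size_t offset = (working_size + alignment - 1) * thread_id;
    if(offset >= buffer_size)
    {
        return nullptr;
    }
    void  *ptr   = buffer + offset;
    size_t space = buffer_size - offset;
    // std::align bumps ptr forward to the next `alignment` boundary, or
    // returns nullptr if fewer than `working_size` bytes remain.
    return std::align(alignment, working_size, ptr, space);
}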
namespace arm_compute
{
-void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
@@ -122,8 +156,6 @@
const int ldb = _input1->info()->strides_in_bytes().y();
const int ldc = _output->info()->strides_in_bytes().y() / sizeof(uint32_t);
- const auto in1_ptr = reinterpret_cast<const gemm_u8_12x8::operand_type *>(_input1->buffer());
-
const int M = std::min(_output->info()->tensor_shape().y(), static_cast<size_t>(window.y().end())) - window.y().start();
const int N = _output->info()->tensor_shape().x();
const int K = _input0->info()->tensor_shape().x();
@@ -134,28 +166,28 @@
win.set(1, Window::Dimension(0, 1, 1));
Iterator in0(_input0, window);
+ Iterator in1(_input1, window);
Iterator out(_output, window);
- GemmInterleaved<gemm_u8_12x8, gemm_u8_12x8::operand_type, gemm_u8_12x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_1, !_transform_1);
-
- constexpr size_t alignment = 4096;
- const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
- void *workspace = _workspace->buffer() + offset;
- size_t workspace_size = _workspace->info()->total_size();
-
- if(support::cpp11::align(alignment, gemm.get_working_size(), workspace, workspace_size) == nullptr)
+ switch(_input0->info()->data_type())
{
- ARM_COMPUTE_ERROR("Not enough space to align buffer!");
+ case DataType::QASYMM8:
+ case DataType::U8:
+ {
+ execute_gemm<gemm_u8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
+ break;
+ }
+ case DataType::S8:
+ {
+ execute_gemm<gemm_s8_12x8>(win, in0, in1, out, info, _workspace, M, N, K, _is_transposed_0, _is_transposed_1, lda, ldb, ldc, _alpha, _beta);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
}
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- gemm.execute(reinterpret_cast<const gemm_u8_12x8::operand_type *>(in0.ptr()), lda,
- reinterpret_cast<const gemm_u8_12x8::operand_type *>(in1_ptr), ldb,
- reinterpret_cast<gemm_u8_12x8::result_type *>(out.ptr()), ldc,
- _alpha, _beta, workspace);
- },
- in0, out);
}
} // namespace arm_compute
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
diff --git a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
index 2256304..38b9102 100644
--- a/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.cpp
@@ -50,20 +50,21 @@
namespace arm_compute
{
-void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
+void NEHGEMMAArch64FP16Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0,
+ bool is_transposed_1)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _workspace = workspace;
- _alpha = alpha;
- _beta = beta;
- _transform_0 = transform_0;
- _transform_1 = transform_1;
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+ _workspace = workspace;
+ _alpha = alpha;
+ _beta = beta;
+ _is_transposed_0 = is_transposed_0;
+ _is_transposed_1 = is_transposed_1;
// Configure kernel window
Window win = calculate_max_window(*output->info());
@@ -105,7 +106,7 @@
Iterator in0(_input0, window);
Iterator out(_output, window);
- GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, !_transform_0, !_transform_1);
+ GemmInterleaved<hgemm_24x8, hgemm_24x8::operand_type, hgemm_24x8::result_type> gemm(&info.cpu_info, M, N, K, _is_transposed_0, _is_transposed_1);
constexpr size_t alignment = 4096;
const size_t offset = (gemm.get_working_size() + alignment - 1) * info.thread_id;
void *workspace = _workspace->buffer() + offset;
diff --git a/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp b/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp
new file mode 100644
index 0000000..52c2db8
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/batched_blocked_gemm.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "batched_blocked_gemm.hpp"
+#include "gemm.hpp"
+using namespace winograd;
+
+template <const int MB, const int NB, typename TIn, typename TOut>
+BatchedBlockedGemm<MB, NB, TIn, TOut>::BatchedBlockedGemm(
+ const unsigned int n_gemms,
+ const int M, const int K, const int N,
+ const int a_matrix_stride,
+ const int a_row_stride,
+ const int b_matrix_stride,
+ const int b_row_stride,
+ const int c_matrix_stride,
+ const int c_row_stride,
+ const TIn* const a_ptr,
+ const TIn* const b_ptr,
+ TOut* const c_ptr
+) : n_gemms(n_gemms), M(M), N(N), K(K),
+ a_matrix_stride(a_matrix_stride),
+ a_row_stride(a_row_stride),
+ b_matrix_stride(b_matrix_stride),
+ b_row_stride(b_row_stride),
+ c_matrix_stride(c_matrix_stride),
+ c_row_stride(c_row_stride),
+ a_ptr(a_ptr), b_ptr(b_ptr), c_ptr(c_ptr)
+{
+}
+
+template <const int MBlock, const int NBlock, typename TIn, typename TOut>
+unsigned int BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::get_window() const
+{
+ return n_gemms;
+}
+
+template <const int MBlock, const int NBlock, typename TIn, typename TOut>
+void BatchedBlockedGemm<MBlock, NBlock, TIn, TOut>::run(
+ const unsigned int start, const unsigned int stop
+)
+{
+ // Perform the specified GEMMs
+ for (unsigned int i = start; i < stop; i++)
+ {
+ // Get pointers to the relevant matrices
+ const TIn* const mtr_a = a_ptr + i*a_matrix_stride;
+ const TIn* const mtr_b = b_ptr + i*b_matrix_stride;
+ TOut* const mtr_c = c_ptr + i*c_matrix_stride;
+
+ // Perform the GEMM
+ BlockedGemm<MBlock, NBlock, TIn, TOut>(
+ mtr_a, mtr_b, mtr_c, M, K, N,
+ a_row_stride, b_row_stride, c_row_stride
+ );
+ }
+}
+
+template class winograd::BatchedBlockedGemm<4, 16, float, float>;
+
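For reference, a plain scalar version of what BatchedBlockedGemm::run(start, stop) computes; the real implementation tiles each product through BlockedGemm<4, 16> instead of the naive triple loop used here:

// Scalar reference: for each GEMM in [start, stop), C_g = A_g * B_g, where
// matrix g starts at base + g * matrix_stride and row i at + i * row_stride.
void batched_gemm_reference(const float *a, const float *b, float *c,
                            unsigned int start, unsigned int stop,
                            int M, int K, int N,
                            int a_matrix_stride, int a_row_stride,
                            int b_matrix_stride, int b_row_stride,
                            int c_matrix_stride, int c_row_stride)
{
    for(unsigned int g = start; g < stop; ++g)
    {
        const float *mtr_a = a + g * a_matrix_stride;
        const float *mtr_b = b + g * b_matrix_stride;
        float       *mtr_c = c + g * c_matrix_stride;
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                float acc = 0.0f;
                for(int k = 0; k < K; ++k)
                {
                    acc += mtr_a[i * a_row_stride + k] * mtr_b[k * b_row_stride + j];
                }
                mtr_c[i * c_row_stride + j] = acc;
            }
        }
    }
}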
diff --git a/src/core/NEON/kernels/winograd/gemm.hpp b/src/core/NEON/kernels/winograd/gemm.hpp
deleted file mode 100644
index 111e196..0000000
--- a/src/core/NEON/kernels/winograd/gemm.hpp
+++ /dev/null
@@ -1,127 +0,0 @@
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include "utils.hpp"
-
-template <typename TIn, typename TOut>
-void Gemm(const TIn* const a, const TIn* const b, TOut *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride,
- const bool a_transposed=false,
- const bool b_transposed=false) {
- // Array access methods
- const auto A = [a, a_transposed, M, K, a_row_stride] (const int i, const int j) -> TIn {
- return a[(!a_transposed) ? i*a_row_stride + j : i + j*M];
- };
-
- const auto B = [b, b_transposed, K, N, b_row_stride] (const int i, const int j) -> TIn {
- return b[(!b_transposed) ? i*b_row_stride + j : i + j*N];
- };
-
- const auto C = [c, c_row_stride] (const int i, const int j) -> TOut& {
- return c[i*c_row_stride + j];
- };
-
- // Perform the matrix multiplication
- for (int i = 0; i < M; i++) {
- for (int j = 0; j < N; j++) {
- for (int k = 0; k < K; k++) {
- C(i, j) += A(i, k) * B(k, j);
- }
- }
- }
-}
-
-template <const int M_BLOCK, const int N_BLOCK, typename TIn, typename TOut>
-void BlockedGemm(
- const TIn* const a, const TIn* const b, TOut *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- // Array access methods
- const auto A = [a, a_row_stride] (const int i, const int j) -> TIn {
- return a[i*a_row_stride + j];
- };
-
- const auto B = [b, b_row_stride] (const int i, const int j) -> TIn {
- return b[i*b_row_stride + j];
- };
-
- const auto C = [c, c_row_stride] (const int i, const int j) -> TOut& {
- return c[i*c_row_stride + j];
- };
-
- const int M_BLOCKS = iceildiv(M, M_BLOCK);
- const int N_BLOCKS = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < M_BLOCKS; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < N_BLOCKS; nblock++) {
- // Create an appropriately sized block of accumulators
- TOut accum[M_BLOCK][N_BLOCK];
- for (int i = 0; i < M_BLOCK; i++) {
- for (int j = 0; j < N_BLOCK; j++) {
- accum[i][j] = static_cast<TOut>(0);
- }
- }
-
- // Perform this portion of the matrix multiply
- for (int k = 0; k < K; k++) {
- // Load elements of A
- TIn elems_a[M_BLOCK];
- for (int i = 0; i < M_BLOCK; i++) {
- elems_a[i] = A(mblock*M_BLOCK + i, k);
- }
-
- // Load elements of B
- TIn elems_b[N_BLOCK];
- for (int j = 0; j < N_BLOCK; j++) {
- elems_b[j] = B(k, nblock*N_BLOCK + j);
- }
-
- // Perform the partial matrix multiply
- for (int i = 0; i < M_BLOCK; i++) {
- for (int j = 0; j < N_BLOCK; j++) {
- accum[i][j] += elems_a[i] * elems_b[j];
- }
- }
- }
-
- // Store the partial product
- for (int i = 0; i < M_BLOCK; i++) {
- for (int j = 0; j < N_BLOCK; j++) {
- C(mblock*M_BLOCK + i, nblock*N_BLOCK + j) = accum[i][j];
- }
- }
- }
- }
-}
-
-#include "gemm/a64_sgemm.hpp"
diff --git a/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp b/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp
deleted file mode 100644
index e1b7488..0000000
--- a/src/core/NEON/kernels/winograd/gemm/a64_sgemm.hpp
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <cassert>
-#include "../utils.hpp"
-
-#ifdef __aarch64__
-
-template <>
-inline void BlockedGemm<8, 12, float, float>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- const int M_BLOCK = 8;
- const int N_BLOCK = 12;
-
- const int m_blocks = iceildiv(M, M_BLOCK);
- const int n_blocks = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < m_blocks; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < n_blocks; nblock++) {
- const float *aptr = a + mblock*M_BLOCK*a_row_stride;
- const float *bptr = b + nblock*N_BLOCK;
- float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
- int k = K;
-
- asm volatile (
- // Create an 8x12 block of accumulators
- " A_1 .req v27\n"
- "sA_1 .req s27\n"
- " A_2 .req v28\n"
- "sA_2 .req s28\n"
- " A_3 .req v29\n"
- "sA_3 .req s29\n"
- " A_4 .req v30\n"
- "sA_4 .req s30\n"
-
- " B_1 .req v24\n" " B_2 .req v25\n" " B_3 .req v26\n"
- "qB_1 .req q24\n" "qB_2 .req q25\n" "qB_3 .req q26\n"
-
- " C_11 .req v0\n" " C_12 .req v1\n" " C_13 .req v2\n"
- " C_21 .req v3\n" " C_22 .req v4\n" " C_23 .req v5\n"
- " C_31 .req v6\n" " C_32 .req v7\n" " C_33 .req v8\n"
- " C_41 .req v9\n" " C_42 .req v10\n" " C_43 .req v11\n"
- " C_51 .req v12\n" " C_52 .req v13\n" " C_53 .req v14\n"
- " C_61 .req v15\n" " C_62 .req v16\n" " C_63 .req v17\n"
- " C_71 .req v18\n" " C_72 .req v19\n" " C_73 .req v20\n"
- " C_81 .req v21\n" " C_82 .req v22\n" " C_83 .req v23\n"
-
- "qC_11 .req q0\n" "qC_12 .req q1\n" "qC_13 .req q2\n"
- "qC_21 .req q3\n" "qC_22 .req q4\n" "qC_23 .req q5\n"
- "qC_31 .req q6\n" "qC_32 .req q7\n" "qC_33 .req q8\n"
- "qC_41 .req q9\n" "qC_42 .req q10\n" "qC_43 .req q11\n"
- "qC_51 .req q12\n" "qC_52 .req q13\n" "qC_53 .req q14\n"
- "qC_61 .req q15\n" "qC_62 .req q16\n" "qC_63 .req q17\n"
- "qC_71 .req q18\n" "qC_72 .req q19\n" "qC_73 .req q20\n"
- "qC_81 .req q21\n" "qC_82 .req q22\n" "qC_83 .req q23\n"
-
- "aptr1 .req x17\n"
- "aptr2 .req x18\n"
- "aptr3 .req x19\n"
- "aptr4 .req x20\n"
- "aptr5 .req x21\n"
- "aptr6 .req x22\n"
- "aptr7 .req x23\n"
-
- // Initialise accumulators with 0
- // Initialise pointers
- "movi C_11.4s, #0\n"
- "add aptr1, %x[aptr], %x[a_row_stride]\n"
- "movi C_12.4s, #0\n"
- "add aptr2, aptr1, %x[a_row_stride]\n"
- "movi C_13.4s, #0\n"
- "add aptr3, aptr2, %x[a_row_stride]\n"
- "movi C_21.4s, #0\n"
- "add aptr4, aptr3, %x[a_row_stride]\n"
- "movi C_22.4s, #0\n"
- "add aptr5, aptr4, %x[a_row_stride]\n"
- "movi C_23.4s, #0\n"
- "add aptr6, aptr5, %x[a_row_stride]\n"
- "movi C_31.4s, #0\n"
- "add aptr7, aptr6, %x[a_row_stride]\n"
- "movi C_32.4s, #0\n"
- "ldr qB_1, [%x[bptr]]\n"
- "movi C_33.4s, #0\n"
- "ldr qB_2, [%x[bptr], #0x10]\n"
- "movi C_41.4s, #0\n"
- "prfm pldl1keep, [%x[bptr], #0x00]\n"
- "movi C_42.4s, #0\n"
- "prfm pldl1keep, [%x[bptr], #0x10]\n"
- "movi C_43.4s, #0\n"
- "prfm pldl1keep, [%x[bptr], #0x20]\n"
- "movi C_51.4s, #0\n"
- "prfm pldl1keep, [%x[aptr], #0x00]\n"
- "movi C_52.4s, #0\n"
- "prfm pldl1keep, [ aptr1, #0x00]\n"
- "movi C_53.4s, #0\n"
- "prfm pldl1keep, [ aptr2, #0x00]\n"
- "movi C_61.4s, #0\n"
- "prfm pldl1keep, [ aptr3, #0x00]\n"
- "movi C_62.4s, #0\n"
- "prfm pldl1keep, [ aptr4, #0x00]\n"
- "movi C_63.4s, #0\n"
- "prfm pldl1keep, [ aptr5, #0x00]\n"
- "movi C_71.4s, #0\n"
- "prfm pldl1keep, [ aptr6, #0x00]\n"
- "movi C_72.4s, #0\n"
- "prfm pldl1keep, [ aptr7, #0x00]\n"
- "movi C_73.4s, #0\n"
- "ldr sA_1, [%x[aptr]], #0x4\n"
- "movi C_81.4s, #0\n"
- "ldr sA_2, [ aptr1], #0x4\n"
- "movi C_82.4s, #0\n"
- "ldr sA_3, [ aptr2], #0x4\n"
- "movi C_83.4s, #0\n"
- "subs %x[k], %x[k], #1\n"
- "beq 2f\n"
-
- "1:"
- "fmla C_11.4s, B_1.4s, A_1.s[0]\n"
- "ldr qB_3, [%x[bptr], #0x20]\n"
- "fmla C_12.4s, B_2.4s, A_1.s[0]\n"
- "ldr sA_4, [ aptr3], #0x4\n"
- "fmla C_13.4s, B_3.4s, A_1.s[0]\n"
- "ldr sA_1, [ aptr4], #0x04\n"
-
- "fmla C_21.4s, B_1.4s, A_2.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride]\n"
- "fmla C_22.4s, B_2.4s, A_2.s[0]\n"
- "prfm pldl1keep, [ aptr3, #0x10]\n"
- "fmla C_23.4s, B_3.4s, A_2.s[0]\n"
- "ldr sA_2, [ aptr5], #0x04\n"
-
- "fmla C_31.4s, B_1.4s, A_3.s[0]\n"
- "prfm pldl1keep, [%x[bptr], #0x00]\n"
- "fmla C_32.4s, B_2.4s, A_3.s[0]\n"
- "prfm pldl1keep, [%x[bptr], #0x10]\n"
- "fmla C_33.4s, B_3.4s, A_3.s[0]\n"
- "ldr sA_3, [ aptr6], #0x04\n"
-
- "fmla C_41.4s, B_1.4s, A_4.s[0]\n"
- "prfm pldl1keep, [%x[bptr], #0x20]\n"
- "fmla C_42.4s, B_2.4s, A_4.s[0]\n"
- "prfm pldl1keep, [ aptr4, #0x10]\n"
- "fmla C_43.4s, B_3.4s, A_4.s[0]\n"
- "ldr sA_4, [ aptr7], #0x04\n"
-
- "fmla C_51.4s, B_1.4s, A_1.s[0]\n"
- "prfm pldl1keep, [ aptr5, #0x10]\n"
- "fmla C_52.4s, B_2.4s, A_1.s[0]\n"
- "prfm pldl1keep, [ aptr6, #0x10]\n"
- "fmla C_53.4s, B_3.4s, A_1.s[0]\n"
- "ldr sA_1, [%x[aptr]], #0x04\n"
-
- "fmla C_61.4s, B_1.4s, A_2.s[0]\n"
- "prfm pldl1keep, [ aptr7, #0x10]\n"
- "fmla C_62.4s, B_2.4s, A_2.s[0]\n"
- "subs %x[k], %x[k], #1\n"
- "fmla C_63.4s, B_3.4s, A_2.s[0]\n"
- "ldr sA_2, [ aptr1], #0x04\n"
-
- "fmla C_71.4s, B_1.4s, A_3.s[0]\n"
- "prfm pldl1keep, [%x[aptr], #0x10]\n"
- "fmla C_72.4s, B_2.4s, A_3.s[0]\n"
- "prfm pldl1keep, [ aptr1, #0x10]\n"
- "fmla C_73.4s, B_3.4s, A_3.s[0]\n"
- "ldr sA_3, [ aptr2], #0x04\n"
-
- "fmla C_81.4s, B_1.4s, A_4.s[0]\n"
- "prfm pldl1keep, [ aptr2, #0x10]\n"
- "fmla C_82.4s, B_2.4s, A_4.s[0]\n"
- "ldp qB_1, qB_2, [%x[bptr]]\n"
- "fmla C_83.4s, B_3.4s, A_4.s[0]\n"
- "bne 1b\n"
-
- "2:"
- "fmla C_11.4s, B_1.4s, A_1.s[0]\n"
- "ldr qB_3, [%x[bptr], #0x20]\n"
- "fmla C_12.4s, B_2.4s, A_1.s[0]\n"
- "stp qC_11, qC_12, [%x[cptr]]\n"
- "fmla C_13.4s, B_3.4s, A_1.s[0]\n"
- "str qC_13, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
- "ldr sA_1, [ aptr4], #0x04\n"
-
- "fmla C_21.4s, B_1.4s, A_2.s[0]\n"
- "ldr sA_4, [ aptr3], #0x4\n"
- "fmla C_22.4s, B_2.4s, A_2.s[0]\n"
- "stp qC_21, qC_22, [%x[cptr]]\n"
- "fmla C_23.4s, B_3.4s, A_2.s[0]\n"
- "str qC_23, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
- "ldr sA_2, [ aptr5], #0x04\n"
-
- "fmla C_31.4s, B_1.4s, A_3.s[0]\n"
- "fmla C_32.4s, B_2.4s, A_3.s[0]\n"
- "stp qC_31, qC_32, [%x[cptr]]\n"
- "fmla C_33.4s, B_3.4s, A_3.s[0]\n"
- "str qC_33, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
- "ldr sA_3, [ aptr6], #0x04\n"
-
- "fmla C_41.4s, B_1.4s, A_4.s[0]\n"
- "fmla C_42.4s, B_2.4s, A_4.s[0]\n"
- "stp qC_41, qC_42, [%x[cptr]]\n"
- "fmla C_43.4s, B_3.4s, A_4.s[0]\n"
- "str qC_43, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
- "ldr sA_4, [ aptr7], #0x04\n"
-
- "fmla C_51.4s, B_1.4s, A_1.s[0]\n"
- "fmla C_52.4s, B_2.4s, A_1.s[0]\n"
- "stp qC_51, qC_52, [%x[cptr]]\n"
- "fmla C_53.4s, B_3.4s, A_1.s[0]\n"
- "str qC_53, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
-
- "fmla C_61.4s, B_1.4s, A_2.s[0]\n"
- "fmla C_62.4s, B_2.4s, A_2.s[0]\n"
- "stp qC_61, qC_62, [%x[cptr]]\n"
- "fmla C_63.4s, B_3.4s, A_2.s[0]\n"
- "str qC_63, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
-
- "fmla C_71.4s, B_1.4s, A_3.s[0]\n"
- "fmla C_72.4s, B_2.4s, A_3.s[0]\n"
- "stp qC_71, qC_72, [%x[cptr]]\n"
- "fmla C_73.4s, B_3.4s, A_3.s[0]\n"
- "str qC_73, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
-
- "fmla C_81.4s, B_1.4s, A_4.s[0]\n"
- "fmla C_82.4s, B_2.4s, A_4.s[0]\n"
- "stp qC_81, qC_82, [%x[cptr]]\n"
- "fmla C_83.4s, B_3.4s, A_4.s[0]\n"
- "str qC_83, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride]\n"
-
- // Clear aliases
- ".unreq aptr1\n"
- ".unreq aptr2\n"
- ".unreq aptr3\n"
- ".unreq aptr4\n"
- ".unreq aptr5\n"
- ".unreq aptr6\n"
- ".unreq aptr7\n"
-
- ".unreq A_1\n" ".unreq A_2\n" ".unreq A_3\n" ".unreq A_4\n"
- ".unreq sA_1\n" ".unreq sA_2\n" ".unreq sA_3\n" ".unreq sA_4\n"
-
- ".unreq B_1\n" ".unreq B_2\n" ".unreq B_3\n"
- ".unreq qB_1\n" ".unreq qB_2\n" ".unreq qB_3\n"
-
- ".unreq C_11\n" ".unreq C_12\n" ".unreq C_13\n"
- ".unreq C_21\n" ".unreq C_22\n" ".unreq C_23\n"
- ".unreq C_31\n" ".unreq C_32\n" ".unreq C_33\n"
- ".unreq C_41\n" ".unreq C_42\n" ".unreq C_43\n"
- ".unreq C_51\n" ".unreq C_52\n" ".unreq C_53\n"
- ".unreq C_61\n" ".unreq C_62\n" ".unreq C_63\n"
- ".unreq C_71\n" ".unreq C_72\n" ".unreq C_73\n"
- ".unreq C_81\n" ".unreq C_82\n" ".unreq C_83\n"
-
- ".unreq qC_11\n" ".unreq qC_12\n" ".unreq qC_13\n"
- ".unreq qC_21\n" ".unreq qC_22\n" ".unreq qC_23\n"
- ".unreq qC_31\n" ".unreq qC_32\n" ".unreq qC_33\n"
- ".unreq qC_41\n" ".unreq qC_42\n" ".unreq qC_43\n"
- ".unreq qC_51\n" ".unreq qC_52\n" ".unreq qC_53\n"
- ".unreq qC_61\n" ".unreq qC_62\n" ".unreq qC_63\n"
- ".unreq qC_71\n" ".unreq qC_72\n" ".unreq qC_73\n"
- ".unreq qC_81\n" ".unreq qC_82\n" ".unreq qC_83\n"
- : [aptr] "+r" (aptr),
- [bptr] "+r" (bptr),
- [cptr] "+r" (cptr),
- [k] "+r" (k)
- : [a_row_stride] "r" (a_row_stride * sizeof(float)),
- [b_row_stride] "r" (b_row_stride * sizeof(float)),
- [c_row_stride] "r" (c_row_stride * sizeof(float))
- : "cc", "memory",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
- "v29", "v30", "x17", "x18", "x19", "x20", "x21", "x22", "x23"
- );
- }
- }
-}
-
-/*****************************************************************************/
-/* 4x16 blocked GEMM with specialised tails
- */
-#include "a64_sgemm_4x16.hpp"
-
-template <>
-inline void BlockedGemm<4, 16, float, float>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- // Despatch based on tail of K
- switch (K % 4) {
- case 3:
- sgemm_4x16_impl<3>(
- a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
- );
- break;
- case 2:
- sgemm_4x16_impl<2>(
- a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
- );
- break;
- case 1:
- sgemm_4x16_impl<1>(
- a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
- );
- break;
- case 0:
- sgemm_4x16_impl<0>(
- a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride
- );
- break;
- default:
- assert(0);
- break;
- }
-}
-
-#endif // __aarch64__
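The BlockedGemm<4, 16, float, float> specialisation deleted above handles arbitrary K by dispatching on K % 4: the remainder becomes a compile-time template parameter of sgemm_4x16_impl, so the main loop can always consume K four elements at a time and the tail is resolved statically. The self-contained sketch below demonstrates the same idiom with a plain scalar kernel; gemm_with_tail and gemm_dispatch are hypothetical names, not the library's.

#include <cassert>

// Scalar kernel with the K tail fixed at compile time: the main loop runs in
// groups of four, the tail loop runs exactly Tail further iterations.
template <unsigned int Tail>
void gemm_with_tail(const float *a, const float *b, float *c,
                    const int M, const int K, const int N,
                    const int a_row_stride, const int b_row_stride, const int c_row_stride)
{
    const int k_quads = (K - static_cast<int>(Tail)) / 4;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            float acc = 0.0f;
            int k = 0;
            for (int q = 0; q < k_quads; q++, k += 4) // always a multiple of four steps
            {
                acc += a[i * a_row_stride + k + 0] * b[(k + 0) * b_row_stride + j];
                acc += a[i * a_row_stride + k + 1] * b[(k + 1) * b_row_stride + j];
                acc += a[i * a_row_stride + k + 2] * b[(k + 2) * b_row_stride + j];
                acc += a[i * a_row_stride + k + 3] * b[(k + 3) * b_row_stride + j];
            }
            for (unsigned int t = 0; t < Tail; t++, k++) // compile-time tail
            {
                acc += a[i * a_row_stride + k] * b[k * b_row_stride + j];
            }
            c[i * c_row_stride + j] = acc;
        }
    }
}

// Runtime remainder selects the statically-specialised kernel.
void gemm_dispatch(const float *a, const float *b, float *c,
                   const int M, const int K, const int N,
                   const int a_row_stride, const int b_row_stride, const int c_row_stride)
{
    switch (K % 4)
    {
        case 0: gemm_with_tail<0>(a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride); break;
        case 1: gemm_with_tail<1>(a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride); break;
        case 2: gemm_with_tail<2>(a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride); break;
        case 3: gemm_with_tail<3>(a, b, c, M, K, N, a_row_stride, b_row_stride, c_row_stride); break;
        default: assert(0); break;
    }
}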
diff --git a/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp b/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp
deleted file mode 100644
index e74610e..0000000
--- a/src/core/NEON/kernels/winograd/gemm/a64_sgemm_4x16.hpp
+++ /dev/null
@@ -1,1445 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-template <const unsigned int tail>
-inline void sgemm_4x16_impl(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-);
-
-template <>
-inline void sgemm_4x16_impl<0>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- const int TAIL_SIZE = 0;
- const int M_BLOCK = 4;
- const int N_BLOCK = 16;
-
- const int m_blocks = iceildiv(M, M_BLOCK);
- const int n_blocks = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < m_blocks; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < n_blocks; nblock++) {
- const float *aptr = a + mblock*M_BLOCK*a_row_stride;
- const float *bptr = b + nblock*N_BLOCK;
- float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
- int k = (K - TAIL_SIZE) / 4;
-
- asm volatile(
- "aptr2 .req X20\n"
- "aptr3 .req X21\n"
- "aptr4 .req X22\n"
- "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
- "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
- "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
- "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
- "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
- "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
- "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
- "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
- "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
- "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
- "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
- "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
- "vB1 .req v20\n" "qB1 .req q20\n"
- "vB2 .req v21\n" "qB2 .req q21\n"
- "vB3 .req v22\n" "qB3 .req q22\n"
- "vB4 .req v23\n" "qB4 .req q23\n"
-
- // Clear accumulators, initialise pointers
- "movi vC11.4s, #0\n"
- "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
- "movi vC12.4s, #0\n"
- "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
- "movi vC13.4s, #0\n"
- "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
- "movi vC14.4s, #0\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "movi vC21.4s, #0\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "movi vC22.4s, #0\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "movi vC23.4s, #0\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "movi vC24.4s, #0\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "movi vC31.4s, #0\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "subs %x[k], %x[k], #1\n"
- "beq 2f\n"
-
- "1:" // Loop proper
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "subs %x[k], %x[k], #1\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
- "bne 1b\n"
-
- "2:" // Tail
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "stp qC11, qC12, [%x[cptr], #0x00]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "stp qC13, qC14, [%x[cptr], #0x20]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "stp qC21, qC22, [%x[cptr], #0x00]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "stp qC23, qC24, [%x[cptr], #0x20]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "stp qC31, qC32, [%x[cptr], #0x00]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "stp qC33, qC34, [%x[cptr], #0x20]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "stp qC41, qC42, [%x[cptr], #0x00]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
- "stp qC43, qC44, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
-
- ".unreq vB4\n" ".unreq qB4\n"
- ".unreq vB3\n" ".unreq qB3\n"
- ".unreq vB2\n" ".unreq qB2\n"
- ".unreq vB1\n" ".unreq qB1\n"
- ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
- ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
- ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
- ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
- ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
- ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
- ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
- ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
- ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
- ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
- ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
- ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
- ".unreq aptr2\n"
- ".unreq aptr3\n"
- ".unreq aptr4\n"
-
- : [aptr] "+r" (aptr),
- [bptr] "+r" (bptr),
- [cptr] "+r" (cptr),
- [k] "+r" (k)
- : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
- [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
- [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
- : "cc", "memory", "x20", "x21", "x22",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23"
- );
- }
- }
-}
-
-template <>
-inline void sgemm_4x16_impl<1>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- const int TAIL_SIZE = 1;
- const int M_BLOCK = 4;
- const int N_BLOCK = 16;
-
- const int m_blocks = iceildiv(M, M_BLOCK);
- const int n_blocks = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < m_blocks; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < n_blocks; nblock++) {
- const float *aptr = a + mblock*M_BLOCK*a_row_stride;
- const float *bptr = b + nblock*N_BLOCK;
- float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
- int k = (K - TAIL_SIZE) / 4;
-
- asm volatile(
- "aptr2 .req X20\n"
- "aptr3 .req X21\n"
- "aptr4 .req X22\n"
- "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
- "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
- "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
- "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
- "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
- "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
- "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
- "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
- "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
- "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
- "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
- "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
- "vB1 .req v20\n" "qB1 .req q20\n"
- "vB2 .req v21\n" "qB2 .req q21\n"
- "vB3 .req v22\n" "qB3 .req q22\n"
- "vB4 .req v23\n" "qB4 .req q23\n"
-
- // Clear accumulators, initialise pointers
- "movi vC11.4s, #0\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "movi vC12.4s, #0\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "movi vC13.4s, #0\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "movi vC14.4s, #0\n"
- "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
- "movi vC21.4s, #0\n"
- "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
- "movi vC22.4s, #0\n"
- "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
- "movi vC23.4s, #0\n"
- "cbnz %x[k], 3f\n"
-
- // Prepare for tail in K
- "movi vC24.4s, #0\n"
- "ldr sA1, [%x[aptr]], #0x04\n"
- "movi vC31.4s, #0\n"
- "ldr sA2, [ aptr2], #0x04\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "b 2f\n" // Jump to tail
-
- "3:" // Prepare for loop over K
- "movi vC24.4s, #0\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "movi vC31.4s, #0\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "subs %x[k], %x[k], #1\n"
- "beq 4f\n"
-
- "1:" // Loop proper
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "subs %x[k], %x[k], #1\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
- "bne 1b\n"
-
- "4:" // Tail iteration
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr sA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr sA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
-
- "2:" // Common tail
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "stp qC11, qC12, [%x[cptr], #0x00]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "ldr sA3, [ aptr3], #0x10\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "stp qC13, qC14, [%x[cptr], #0x20]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "stp qC21, qC22, [%x[cptr], #0x00]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "ldr sA4, [ aptr4], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "stp qC23, qC24, [%x[cptr], #0x20]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "stp qC31, qC32, [%x[cptr], #0x00]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "stp qC33, qC34, [%x[cptr], #0x20]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "stp qC41, qC42, [%x[cptr], #0x00]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
- "stp qC43, qC44, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
-
- ".unreq vB4\n" ".unreq qB4\n"
- ".unreq vB3\n" ".unreq qB3\n"
- ".unreq vB2\n" ".unreq qB2\n"
- ".unreq vB1\n" ".unreq qB1\n"
- ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
- ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
- ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
- ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
- ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
- ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
- ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
- ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
- ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
- ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
- ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
- ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
- ".unreq aptr2\n"
- ".unreq aptr3\n"
- ".unreq aptr4\n"
-
- : [aptr] "+r" (aptr),
- [bptr] "+r" (bptr),
- [cptr] "+r" (cptr),
- [k] "+r" (k)
- : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
- [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
- [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
- : "cc", "memory", "x20", "x21", "x22",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23"
- );
- }
- }
-}
-
-template <>
-inline void sgemm_4x16_impl<2>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- const int TAIL_SIZE = 2;
- const int M_BLOCK = 4;
- const int N_BLOCK = 16;
-
- const int m_blocks = iceildiv(M, M_BLOCK);
- const int n_blocks = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < m_blocks; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < n_blocks; nblock++) {
- const float *aptr = a + mblock*M_BLOCK*a_row_stride;
- const float *bptr = b + nblock*N_BLOCK;
- float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
- int k = (K - TAIL_SIZE) / 4;
-
- asm volatile(
- "aptr2 .req X20\n"
- "aptr3 .req X21\n"
- "aptr4 .req X22\n"
- "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
- "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
- "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
- "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
- "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
- "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
- "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
- "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
- "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
- "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
- "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
- "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
- "vB1 .req v20\n" "qB1 .req q20\n"
- "vB2 .req v21\n" "qB2 .req q21\n"
- "vB3 .req v22\n" "qB3 .req q22\n"
- "vB4 .req v23\n" "qB4 .req q23\n"
-
- // Clear accumulators, initialise pointers
- "movi vC11.4s, #0\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "movi vC12.4s, #0\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "movi vC13.4s, #0\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "movi vC14.4s, #0\n"
- "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
- "movi vC21.4s, #0\n"
- "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
- "movi vC22.4s, #0\n"
- "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
- "movi vC23.4s, #0\n"
- "cbnz %x[k], 3f\n"
-
- // Prepare for tail in K
- "movi vC24.4s, #0\n"
- "ldr dA1, [%x[aptr]], #0x08\n"
- "movi vC31.4s, #0\n"
- "ldr dA2, [ aptr2], #0x08\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "b 2f\n" // Jump to tail
-
- "3:" // Prepare for loop over K
- "movi vC24.4s, #0\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "movi vC31.4s, #0\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "subs %x[k], %x[k], #1\n"
- "beq 4f\n"
-
- "1:" // Loop proper
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "subs %x[k], %x[k], #1\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
- "bne 1b\n"
-
- "4:" // Tail iteration
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr dA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr dA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
-
- "2:" // Common tail
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr dA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr dA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "stp qC11, qC12, [%x[cptr], #0x00]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "stp qC13, qC14, [%x[cptr], #0x20]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "stp qC21, qC22, [%x[cptr], #0x00]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "stp qC23, qC24, [%x[cptr], #0x20]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "stp qC31, qC32, [%x[cptr], #0x00]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "stp qC33, qC34, [%x[cptr], #0x20]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "stp qC41, qC42, [%x[cptr], #0x00]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
- "stp qC43, qC44, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
-
- ".unreq vB4\n" ".unreq qB4\n"
- ".unreq vB3\n" ".unreq qB3\n"
- ".unreq vB2\n" ".unreq qB2\n"
- ".unreq vB1\n" ".unreq qB1\n"
- ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
- ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
- ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
- ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
- ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
- ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
- ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
- ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
- ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
- ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
- ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
- ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
- ".unreq aptr2\n"
- ".unreq aptr3\n"
- ".unreq aptr4\n"
-
- : [aptr] "+r" (aptr),
- [bptr] "+r" (bptr),
- [cptr] "+r" (cptr),
- [k] "+r" (k)
- : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
- [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
- [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
- : "cc", "memory", "x20", "x21", "x22",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23"
- );
- }
- }
-}
-
-template <>
-inline void sgemm_4x16_impl<3>(
- const float* const a, const float* const b, float *c,
- const int M, const int K, const int N,
- const int a_row_stride,
- const int b_row_stride,
- const int c_row_stride
-) {
- const int TAIL_SIZE = 3;
- const int M_BLOCK = 4;
- const int N_BLOCK = 16;
-
- const int m_blocks = iceildiv(M, M_BLOCK);
- const int n_blocks = iceildiv(N, N_BLOCK);
-
- // For each block of output rows
- for (int mblock = 0; mblock < m_blocks; mblock++) {
- // For each block of output columns
- for (int nblock = 0; nblock < n_blocks; nblock++) {
- const float *aptr = a + mblock*M_BLOCK*a_row_stride;
- const float *bptr = b + nblock*N_BLOCK;
- float *cptr = c + mblock*M_BLOCK*c_row_stride + nblock*N_BLOCK;
- int k = (K - TAIL_SIZE) / 4;
-
- asm volatile(
- "aptr2 .req X20\n"
- "aptr3 .req X21\n"
- "aptr4 .req X22\n"
- "vC11 .req v0\n" "vC12 .req v1\n" "vC13 .req v2\n" "vC14 .req v3\n"
- "qC11 .req q0\n" "qC12 .req q1\n" "qC13 .req q2\n" "qC14 .req q3\n"
- "vC21 .req v4\n" "vC22 .req v5\n" "vC23 .req v6\n" "vC24 .req v7\n"
- "qC21 .req q4\n" "qC22 .req q5\n" "qC23 .req q6\n" "qC24 .req q7\n"
- "vC31 .req v8\n" "vC32 .req v9\n" "vC33 .req v10\n" "vC34 .req v11\n"
- "qC31 .req q8\n" "qC32 .req q9\n" "qC33 .req q10\n" "qC34 .req q11\n"
- "vC41 .req v12\n" "vC42 .req v13\n" "vC43 .req v14\n" "vC44 .req v15\n"
- "qC41 .req q12\n" "qC42 .req q13\n" "qC43 .req q14\n" "qC44 .req q15\n"
- "vA1 .req v16\n" "qA1 .req q16\n" "dA1 .req d16\n" "sA1 .req s16\n"
- "vA2 .req v17\n" "qA2 .req q17\n" "dA2 .req d17\n" "sA2 .req s17\n"
- "vA3 .req v18\n" "qA3 .req q18\n" "dA3 .req d18\n" "sA3 .req s18\n"
- "vA4 .req v19\n" "qA4 .req q19\n" "dA4 .req d19\n" "sA4 .req s19\n"
- "vB1 .req v20\n" "qB1 .req q20\n"
- "vB2 .req v21\n" "qB2 .req q21\n"
- "vB3 .req v22\n" "qB3 .req q22\n"
- "vB4 .req v23\n" "qB4 .req q23\n"
-
- // Clear accumulators, initialise pointers
- "movi vC11.4s, #0\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "movi vC12.4s, #0\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "movi vC13.4s, #0\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "movi vC14.4s, #0\n"
- "add aptr2, %x[aptr], %x[a_row_stride_bytes]\n"
- "movi vC21.4s, #0\n"
- "add aptr3, aptr2, %x[a_row_stride_bytes]\n"
- "movi vC22.4s, #0\n"
- "add aptr4, aptr3, %x[a_row_stride_bytes]\n"
- "movi vC23.4s, #0\n"
- "cbnz %x[k], 3f\n"
-
- // Prepare for tail in K
- "movi vC24.4s, #0\n"
- "ldr dA1, [%x[aptr]], #0x08\n"
- "movi vC31.4s, #0\n"
- "ldr dA2, [ aptr2], #0x08\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "b 2f\n" // Jump to tail
-
- "3:" // Prepare for loop over K
- "movi vC24.4s, #0\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "movi vC31.4s, #0\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "movi vC32.4s, #0\n"
- "movi vC33.4s, #0\n"
- "movi vC34.4s, #0\n"
- "movi vC41.4s, #0\n"
- "movi vC42.4s, #0\n"
- "movi vC43.4s, #0\n"
- "movi vC44.4s, #0\n"
- "subs %x[k], %x[k], #1\n"
- "beq 4f\n"
-
- "1:" // Loop proper
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "subs %x[k], %x[k], #1\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr qA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr qA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
- "bne 1b\n"
-
- "4:" // Tail iteration
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr qA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[2]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[2]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[2]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[2]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[2]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[2]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[2]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[2]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[2]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[2]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[2]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[2]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[2]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[2]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[2]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[2]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[3]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[3]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[3]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[3]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[3]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[3]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[3]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[3]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[3]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[3]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[3]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[3]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[3]\n"
- "ldr dA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[3]\n"
- "ldr dA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[3]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[3]\n"
-
- "2:" // Common tail
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr dA3, [ aptr3], #0x10\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "ldr dA4, [ aptr4], #0x10\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[1]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[1]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[1]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[1]\n"
- "add %x[bptr], %x[bptr], %x[b_row_stride_bytes]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[1]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[1]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[1]\n"
- "ldr qB1, [%x[bptr], #0x00]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[1]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[1]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[1]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[1]\n"
- "ldr qB2, [%x[bptr], #0x10]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[1]\n"
- "fmla vC14.4s, vB4.4s, vA1.s[1]\n"
- "ldr sA1, [%x[aptr]], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[1]\n"
- "ldr sA2, [ aptr2], #0x10\n"
- "fmla vC34.4s, vB4.4s, vA3.s[1]\n"
- "ldr qB3, [%x[bptr], #0x20]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[1]\n"
-
- "fmla vC11.4s, vB1.4s, vA1.s[0]\n"
- "ldr qB4, [%x[bptr], #0x30]\n"
- "fmla vC12.4s, vB2.4s, vA1.s[0]\n"
- "stp qC11, qC12, [%x[cptr], #0x00]\n"
- "fmla vC13.4s, vB3.4s, vA1.s[0]\n"
- "ldr sA3, [ aptr3], #0x10\n"
- "fmla vC14.4s, vB4.4s, vA1.s[0]\n"
- "stp qC13, qC14, [%x[cptr], #0x20]\n"
- "fmla vC21.4s, vB1.4s, vA2.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC22.4s, vB2.4s, vA2.s[0]\n"
- "stp qC21, qC22, [%x[cptr], #0x00]\n"
- "fmla vC23.4s, vB3.4s, vA2.s[0]\n"
- "ldr sA4, [ aptr4], #0x10\n"
- "fmla vC24.4s, vB4.4s, vA2.s[0]\n"
- "stp qC23, qC24, [%x[cptr], #0x20]\n"
- "fmla vC31.4s, vB1.4s, vA3.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC32.4s, vB2.4s, vA3.s[0]\n"
- "stp qC31, qC32, [%x[cptr], #0x00]\n"
- "fmla vC33.4s, vB3.4s, vA3.s[0]\n"
- "fmla vC34.4s, vB4.4s, vA3.s[0]\n"
- "stp qC33, qC34, [%x[cptr], #0x20]\n"
- "fmla vC41.4s, vB1.4s, vA4.s[0]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
- "fmla vC42.4s, vB2.4s, vA4.s[0]\n"
- "stp qC41, qC42, [%x[cptr], #0x00]\n"
- "fmla vC43.4s, vB3.4s, vA4.s[0]\n"
- "fmla vC44.4s, vB4.4s, vA4.s[0]\n"
- "stp qC43, qC44, [%x[cptr], #0x20]\n"
- "add %x[cptr], %x[cptr], %x[c_row_stride_bytes]\n"
-
- ".unreq vB4\n" ".unreq qB4\n"
- ".unreq vB3\n" ".unreq qB3\n"
- ".unreq vB2\n" ".unreq qB2\n"
- ".unreq vB1\n" ".unreq qB1\n"
- ".unreq vA4\n" ".unreq qA4\n" ".unreq dA4\n" ".unreq sA4\n"
- ".unreq vA3\n" ".unreq qA3\n" ".unreq dA3\n" ".unreq sA3\n"
- ".unreq vA2\n" ".unreq qA2\n" ".unreq dA2\n" ".unreq sA2\n"
- ".unreq vA1\n" ".unreq qA1\n" ".unreq dA1\n" ".unreq sA1\n"
- ".unreq qC41\n" ".unreq qC42\n" ".unreq qC43\n" ".unreq qC44\n"
- ".unreq vC41\n" ".unreq vC42\n" ".unreq vC43\n" ".unreq vC44\n"
- ".unreq qC31\n" ".unreq qC32\n" ".unreq qC33\n" ".unreq qC34\n"
- ".unreq vC31\n" ".unreq vC32\n" ".unreq vC33\n" ".unreq vC34\n"
- ".unreq qC21\n" ".unreq qC22\n" ".unreq qC23\n" ".unreq qC24\n"
- ".unreq vC21\n" ".unreq vC22\n" ".unreq vC23\n" ".unreq vC24\n"
- ".unreq qC11\n" ".unreq qC12\n" ".unreq qC13\n" ".unreq qC14\n"
- ".unreq vC11\n" ".unreq vC12\n" ".unreq vC13\n" ".unreq vC14\n"
- ".unreq aptr2\n"
- ".unreq aptr3\n"
- ".unreq aptr4\n"
-
- : [aptr] "+r" (aptr),
- [bptr] "+r" (bptr),
- [cptr] "+r" (cptr),
- [k] "+r" (k)
- : [a_row_stride_bytes] "r" (a_row_stride * sizeof(float)),
- [b_row_stride_bytes] "r" (b_row_stride * sizeof(float)),
- [c_row_stride_bytes] "r" (c_row_stride * sizeof(float))
- : "cc", "memory", "x20", "x21", "x22",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
- "v21", "v22", "v23"
- );
- }
- }
-}
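For reference, the asm block above is the tail of an AArch64 SGEMM micro-kernel: it keeps a 4x16 block of C entirely in registers (vC11..vC44, four 4-lane accumulators per row of A), streams 16-wide rows of B and 4-deep slices of A through it, and finishes the remaining depth in the labelled tail sections. A minimal scalar sketch of the same 4x16 blocking follows; the pointer and stride names mirror the asm operands (strides here are in floats, not bytes), and it is an illustration of the accumulation pattern rather than the library's fallback path.

    #include <cstddef>

    // Scalar sketch of the 4x16 register-blocked accumulation performed by the
    // asm above: C[0..3][0..15] = sum over p of A[i][p] * B[p][j].
    inline void sgemm_4x16_block_ref(const float *aptr, const float *bptr, float *cptr,
                                     const int depth,
                                     const std::size_t a_row_stride,
                                     const std::size_t b_row_stride,
                                     const std::size_t c_row_stride)
    {
        float acc[4][16] = {};  // the vC11..vC44 accumulators, zeroed by the movi instructions

        for (int p = 0; p < depth; p++)
        {
            for (int i = 0; i < 4; i++)
            {
                const float a = aptr[i * a_row_stride + p];       // one lane of vA1..vA4
                for (int j = 0; j < 16; j++)
                {
                    acc[i][j] += a * bptr[p * b_row_stride + j];  // the fmla instructions
                }
            }
        }

        for (int i = 0; i < 4; i++)
        {
            for (int j = 0; j < 16; j++)
            {
                cptr[i * c_row_stride + j] = acc[i][j];           // the stp qC.., qC.. stores
            }
        }
    }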
diff --git a/src/core/NEON/kernels/winograd/perf.h b/src/core/NEON/kernels/winograd/perf.h
deleted file mode 100644
index 11fb0c4..0000000
--- a/src/core/NEON/kernels/winograd/perf.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* Prototypes from perf.c */
-
-void start_counter(int fd);
-long long get_counter(int fd);
-long long stop_counter(int fd);
-int open_instruction_counter(void);
-int open_cycle_counter(void);
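These prototypes come from a small perf_event wrapper (perf.c, which is not part of this diff). A plausible Linux-only sketch of what those helpers could look like, built on the standard perf_event_open syscall, is shown below; the actual perf.c may differ in detail.

    // Hypothetical sketch of the perf.c helpers declared above (Linux only).
    #include <cstring>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    static int open_hw_counter(const unsigned int type, const unsigned long long config)
    {
        perf_event_attr attr;
        std::memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = config;
        attr.disabled = 1;
        attr.exclude_kernel = 1;
        attr.exclude_hv = 1;
        // Monitor this process (pid 0) on any CPU, no event group, no flags.
        return static_cast<int>(syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0));
    }

    int open_cycle_counter(void)       { return open_hw_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES); }
    int open_instruction_counter(void) { return open_hw_counter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS); }

    void start_counter(const int fd)
    {
        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    }

    long long get_counter(const int fd)
    {
        long long value = 0;
        if (read(fd, &value, sizeof(value)) != static_cast<ssize_t>(sizeof(value))) { return -1; }
        return value;
    }

    long long stop_counter(const int fd)
    {
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
        return get_counter(fd);
    }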
diff --git a/src/core/NEON/kernels/winograd/profiler.hpp b/src/core/NEON/kernels/winograd/profiler.hpp
deleted file mode 100644
index 143192b..0000000
--- a/src/core/NEON/kernels/winograd/profiler.hpp
+++ /dev/null
@@ -1,244 +0,0 @@
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <cstdio>
-#include <map>
-#include <vector>
-
-#include "perf.h"
-#include <unistd.h>
-
-class profiler {
-private:
-#ifdef CYCLE_PROFILING
- struct ProfileEntry {
- int event_id;
- long int bytes_read, ops, bytes_written;
- long int duration;
- };
-
- static const int maxevents = 10000;
- ProfileEntry events[maxevents];
- int currentevent;
- int countfd;
-
- std::map<const char *, int> event_ids;
-
- int get_event_id(const char *id) {
- if (!event_ids.count(id)) {
- event_ids.emplace(id, event_ids.size());
- }
- return event_ids[id];
- }
-#endif // CYCLE_PROFILING
-
-public:
-#ifdef CYCLE_PROFILING
- profiler() {
- currentevent = 0;
- countfd = open_cycle_counter();
- }
-
- ~profiler() {
- close(countfd);
-
- // Compute performance from recorded events
- struct ProfileResult {
- ProfileResult() : total_calls(0),
- total_duration(0),
- total_bytes_read(0),
- total_ops(0),
- total_bytes_written(0) {
- }
-
- void operator+=(const ProfileEntry &rhs) {
- total_calls++;
- total_duration += rhs.duration;
- total_bytes_read += rhs.bytes_read;
- total_ops += rhs.ops;
- total_bytes_written += rhs.bytes_written;
- }
-
- float avg_duration(void) const {
- return static_cast<float>(total_duration) /
- static_cast<float>(total_calls);
- }
-
- float bytes_read_per_cycle(void) const {
- return static_cast<float>(total_bytes_read) /
- static_cast<float>(total_duration);
- }
-
- float ops_per_cycle(void) const {
- return static_cast<float>(total_ops) /
- static_cast<float>(total_duration);
- }
-
- float bytes_written_per_cycle(void) const {
- return static_cast<float>(total_bytes_written) /
- static_cast<float>(total_duration);
- }
-
- long int total_calls,
- total_duration,
- total_bytes_read,
- total_ops,
- total_bytes_written;
- };
-
- std::vector<ProfileResult> totals;
- totals.resize(event_ids.size());
- for (int i = 0; i < currentevent; i++) {
- const auto &event = events[i];
- totals[event.event_id] += event;
- }
-
- // Get the longest label
- int len_label = 0;
- for (const auto &kv : event_ids) {
- len_label = std::max(len_label, static_cast<int>(strlen(kv.first)));
- }
-
- // Get the longest values for every other field
- const auto get_length_of_field =
- [totals] (const char *title, auto f, auto len) -> size_t {
- size_t l = strlen(title);
- for (const auto &v : totals) {
- l = std::max(l, len(f(v)));
- }
- return l;
- };
-
- // Get the strlen for an int
- const auto intlen = [] (long int x) -> size_t {
- size_t len = 0;
- do {
- x /= 10;
- len++;
- } while (x);
- return len;
- };
-
- // Get the strlen for a float
- const auto floatlen = [] (const int precision) {
- return [precision] (float x) {
- size_t len = 0;
-
- if (!std::isfinite(x)) {
- return static_cast<size_t>(3);
- }
-
- do {
- x /= 10.0f;
- len++;
- } while (x > 1.0f);
- return len + 1 + precision;
- };
- };
-
- const int len_calls = get_length_of_field(
- "Calls", [] (const auto &v) {return v.total_calls;},
- intlen
- );
- const int len_duration = get_length_of_field(
- "Duration", [] (const auto &v) {return v.total_duration;},
- intlen
- );
- const int len_average_duration = get_length_of_field(
- "Average", [] (const auto &v) {return v.avg_duration();},
- floatlen(2)
- );
- const int len_reads_per_cycle = get_length_of_field(
- "Reads / cycle",
- [] (const auto &v) {return v.bytes_read_per_cycle();},
- floatlen(6)
- );
- const int len_ops_per_cycle = get_length_of_field(
- "Ops / cycle",
- [] (const auto &v) {return v.ops_per_cycle();},
- floatlen(6)
- );
- const int len_writes_per_cycle = get_length_of_field(
- "Writes / cycle",
- [] (const auto &v) {return v.bytes_written_per_cycle();},
- floatlen(6)
- );
-
- // Print header
- printf(
- "%*s %*s %*s %*s %*s %*s %*s\n",
- len_label, "",
- len_calls, "Calls",
- len_duration, "Duration",
- len_average_duration, "Average",
- len_reads_per_cycle, "Reads / cycle",
- len_ops_per_cycle, "Ops / cycle",
- len_writes_per_cycle, "Writes / cycle"
- );
- for (const auto &kv : event_ids) {
- const auto id = kv.second;
- printf(
- "%*s %*ld %*ld %*.2f %*.6f %*.6f %*.6f\n",
- len_label, kv.first,
- len_calls, totals[id].total_calls,
- len_duration, totals[id].total_duration,
- len_average_duration, totals[id].avg_duration(),
- len_reads_per_cycle, totals[id].bytes_read_per_cycle(),
- len_ops_per_cycle, totals[id].ops_per_cycle(),
- len_writes_per_cycle, totals[id].bytes_written_per_cycle()
- );
- }
- printf("\n");
- }
-#endif // CYCLE_PROFILING
-
- template <typename T>
- void operator() (const char * event,
- T func,
- long int bytes_read = 0,
- long int ops = 0,
- long int bytes_written = 0) {
-#ifdef CYCLE_PROFILING
- if (currentevent==maxevents) {
- func();
- } else {
- start_counter(countfd);
- func();
- long long cycs = stop_counter(countfd);
-
- // Store the profiling data
- events[currentevent++] = {
- get_event_id(event), bytes_read, ops, bytes_written, cycs
- };
- }
-#else
- func();
-#endif // CYCLE_PROFILING
- }
-};
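With CYCLE_PROFILING defined, the class above brackets each call with the cycle counter, records one ProfileEntry per invocation, and prints the aggregated per-event table from its destructor; without the define, operator() collapses to a plain call. A small usage sketch, with illustrative kernel names and counts:

    #include "profiler.hpp"

    void run_input_transform();  // illustrative kernels, not part of the library
    void run_gemm();

    void profiled_run(const long int transform_bytes, const long int gemm_ops)
    {
        profiler prof;  // prints the summary table when it goes out of scope

        // Each call records one event under the given label; the byte and op
        // counts feed the Reads/Ops/Writes-per-cycle columns of the report.
        prof("input transform", [] { run_input_transform(); },
             transform_bytes /* bytes read */, 0 /* ops */, transform_bytes /* bytes written */);

        prof("gemm", [] { run_gemm(); }, 0, gemm_ops, 0);
    }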
diff --git a/src/core/NEON/kernels/winograd/shims.hpp b/src/core/NEON/kernels/winograd/shims.hpp
deleted file mode 100644
index 249e575..0000000
--- a/src/core/NEON/kernels/winograd/shims.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-/** Re-order a weight tensor from [Output feature map x Input feature map x
- * Height x Width] format to [Height x Width x Input feature map x Output
- * feature map] format.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
- const T* const in, // Input in [Output x Input x Height x Width] form
- T* const out, // Output in [Height x Width x Input x Output] form
- const int n_output_feature_maps,
- const int n_input_feature_maps,
- const int n_rows,
- const int n_cols,
- int in_output_feature_map_stride=0,
- int in_input_feature_map_stride=0,
- int in_row_stride=0,
- int out_row_stride=0,
- int out_col_stride=0,
- int out_input_feature_map_stride=0
-);
-
-/** Re-order a weight tensor from [Height x Width x Input feature map x Output
- * feature map] format to [Output feature map x Input feature map x Height x
- * Width] format.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
- const T* const in, // Input in [Height x Width x Input x Output] form
- T* const out, // Output in [Output x Input x Height x Width] form
- const int n_rows,
- const int n_cols,
- const int n_input_feature_maps,
- const int n_output_feature_maps,
- int in_row_stride=0,
- int in_col_stride=0,
- int in_input_feature_map_stride=0,
- int out_output_feature_map_stride=0,
- int out_input_feature_map_stride=0,
- int out_row_stride=0
-);
-
-
-/* Re-order a tensor from NCHW format to NHWC.
- */
-template <typename T>
-inline void nchw_to_nhwc(
- const T* const in,
- T* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride=0,
- int in_channel_stride=0,
- int in_row_stride=0,
- int out_batch_stride=0,
- int out_row_stride=0,
- int out_col_stride=0
-)
-{
- // Fill in the stride values
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
- in_channel_stride = (in_channel_stride) ? in_channel_stride
- : n_rows * in_row_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_channels * in_channel_stride;
-
- out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_rows * out_row_stride;
-
- // Perform the re-ordering
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in_batch + i*in_row_stride;
- T* const out_row = out_batch + i*out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j;
- T* const out_col = out_row + j*out_col_stride;
-
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_col + c*in_channel_stride;
- out_col[c] = *(in_channel);
- }
- }
- }
- }
-}
-
-/* Re-order a tensor from NHWC format to NCHW.
- */
-template <typename T>
-inline void nhwc_to_nchw(
- const T* const in, // Input data in NHWC form
- T* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride=0,
- int in_row_stride=0,
- int in_col_stride=0,
- int out_batch_stride=0,
- int out_channel_stride=0,
- int out_row_stride=0
-)
-{
- // Fill in stride values
- in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_rows * in_row_stride;
-
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
- out_channel_stride = (out_channel_stride) ? out_channel_stride
- : n_rows * out_row_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_channels * out_channel_stride;
-
- // Perform the re-ordering
- // For every batch
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- // For every row
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_i = in_batch + i*in_row_stride;
- T* const out_i = out_batch + i*out_row_stride;
-
- // For every column
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_j = in_i + j*in_col_stride;
- T* const out_j = out_i + j;
-
- // For every channel
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_j + c;
- T* const out_channel = out_j + c*out_channel_stride;
- *(out_channel) = *(in_channel);
- }
- }
- }
- }
-}
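Both tensor re-orders above treat a stride argument of zero as "derive the dense default from the shape", so contiguous tensors only need their dimensions. A short round-trip sketch with small illustrative sizes, assuming this header is included:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    void nchw_nhwc_roundtrip_example()
    {
        const int n = 1, c = 3, h = 4, w = 5;
        std::vector<float> nchw(n * c * h * w), nhwc(n * h * w * c), back(n * c * h * w);
        for (std::size_t i = 0; i < nchw.size(); i++) { nchw[i] = static_cast<float>(i); }

        // Dense tensors: every stride argument is left at 0 and filled in internally.
        nchw_to_nhwc(nchw.data(), nhwc.data(), n, c, h, w);
        nhwc_to_nchw(nhwc.data(), back.data(), n, h, w, c);

        // Element mapping: nhwc[((b*h + i)*w + j)*c + k] == nchw[((b*c + k)*h + i)*w + j]
        assert(back == nchw);
    }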
-
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
- const T* const in, // Input in [Output x Input x Height x Width] form
- T* const out, // Output in [Height x Width x Input x Output] form
- const int n_output_feature_maps,
- const int n_input_feature_maps,
- const int n_rows,
- const int n_cols,
- int in_output_feature_map_stride,
- int in_input_feature_map_stride,
- int in_row_stride,
- int out_row_stride,
- int out_col_stride,
- int out_input_feature_map_stride
-)
-{
- // Fill in stride values
- in_row_stride = (in_row_stride)
- ? in_row_stride
- : n_cols;
- in_input_feature_map_stride = (in_input_feature_map_stride)
- ? in_input_feature_map_stride
- : n_rows * in_row_stride;
- in_output_feature_map_stride = (in_output_feature_map_stride)
- ? in_output_feature_map_stride
- : n_input_feature_maps * in_input_feature_map_stride;
-
- out_input_feature_map_stride = (out_input_feature_map_stride)
- ? out_input_feature_map_stride
- : n_output_feature_maps;
- out_col_stride = (out_col_stride)
- ? out_col_stride
- : n_input_feature_maps * out_input_feature_map_stride;
- out_row_stride = (out_row_stride)
- ? out_row_stride
- : n_cols * out_col_stride;
-
- // Perform the re-ordering
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in + i * in_row_stride;
- T* out_row = out + i * out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j;
- T* const out_col = out_row + j * out_col_stride;
-
- for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
- {
- const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
- T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
- for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
- {
- const T* const in_ofm = in_ifm + ofm * in_output_feature_map_stride;
- T* const out_ofm = out_ifm + ofm;
- *(out_ofm) = *(in_ofm);
- }
- }
- }
- }
-}
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
- const T* const in, // Input in [Height x Width x Input x Output] form
- T* const out, // Output in [Output x Input x Height x Width] form
- const int n_rows,
- const int n_cols,
- const int n_input_feature_maps,
- const int n_output_feature_maps,
- int in_row_stride,
- int in_col_stride,
- int in_input_feature_map_stride,
- int out_output_feature_map_stride,
- int out_input_feature_map_stride,
- int out_row_stride
-)
-{
- // Fill in the stride values
- in_input_feature_map_stride = (in_input_feature_map_stride)
- ? in_input_feature_map_stride
- : n_output_feature_maps;
- in_col_stride = (in_col_stride)
- ? in_col_stride
- : n_input_feature_maps * in_input_feature_map_stride;
- in_row_stride = (in_row_stride)
- ? in_row_stride
- : n_cols * in_col_stride;
-
- out_row_stride = (out_row_stride)
- ? out_row_stride
- : n_cols;
- out_input_feature_map_stride = (out_input_feature_map_stride)
- ? out_input_feature_map_stride
- : n_rows * out_row_stride;
- out_output_feature_map_stride = (out_output_feature_map_stride)
- ? out_output_feature_map_stride
- : n_input_feature_maps * out_input_feature_map_stride;
-
- // Perform the re-ordering
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in + i * in_row_stride;
- T* const out_row = out + i * out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j * in_col_stride;
- T* const out_col = out_row + j;
-
- for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
- {
- const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
- T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
- for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
- {
- const T* const in_ofm = in_ifm + ofm;
- T* const out_ofm = out_ifm + ofm * out_output_feature_map_stride;
- *(out_ofm) = *(in_ofm);
- }
- }
- }
- }
-}
-
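The two weight re-orders follow the same zero-means-dense convention. In index terms, ofm_ifm_h_w_to_h_w_ifm_ofm performs out[row][col][ifm][ofm] = in[ofm][ifm][row][col], and h_w_ifm_ofm_to_ofm_ifm_h_w is its inverse. A compact spot-check of that mapping for dense tensors, with illustrative sizes:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    void weight_reorder_example()
    {
        const int ofms = 8, ifms = 3, rows = 3, cols = 3;
        std::vector<float> oihw(ofms * ifms * rows * cols), hwio(oihw.size());
        for (std::size_t i = 0; i < oihw.size(); i++) { oihw[i] = static_cast<float>(i); }

        ofm_ifm_h_w_to_h_w_ifm_ofm(oihw.data(), hwio.data(), ofms, ifms, rows, cols);

        // hwio[((i*cols + j)*ifms + c)*ofms + m] == oihw[((m*ifms + c)*rows + i)*cols + j]
        const int i = 1, j = 2, c = 0, m = 5;
        assert(hwio[((i * cols + j) * ifms + c) * ofms + m] ==
               oihw[((m * ifms + c) * rows + i) * cols + j]);
    }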
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp
deleted file mode 100644
index ca8d012..0000000
--- a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3.hpp
+++ /dev/null
@@ -1,639 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include "arm_compute/core/NEON/kernels/winograd/tensor.hpp"
-
-
-namespace winograd {
- /* Transform an input tensor into the Winograd domain.
- */
- template <typename T>
- struct Winograd2x2_3x3GemmInput {
- static void execute(
- const T *inptr,
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- const int tile_M,
- const int tile_N,
- T *outptr_base,
- const int matrix_stride,
- const int matrix_batch_stride,
- const int matrix_row_stride
- );
-
- static size_t bytes_read(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- return input_shape.n_batches * tile_rows * (16 + 8*(tile_cols - 1)) * input_shape.n_channels * sizeof(T);
- }
-
- static int flops_performed(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- return input_shape.n_batches * tile_rows * (32 + 24*(tile_cols - 1)) * input_shape.n_channels;
- }
-
- static size_t bytes_written(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- const int M = input_shape.n_batches * tile_rows * tile_cols;
- return 16 * M * input_shape.n_channels * sizeof(T);
- }
-
- protected:
- template <const PaddingType padding, const int pad_bottom, const int pad_right>
- static void process_tile_tensor(
- const int tile_M, // Number of rows of tiles
- const int tile_N, // Number of columns of tiles
- int n_channels, // Number of input channels
- const T* const input, // Base input pointer (appropriate to batch and channel)
- const int input_row_stride, // Stride between rows of the input
- const int input_col_stride, // Stride between columns of the input
- T* const matrix, // 1st output matrix (appropriate to batch and channel)
- const int matrix_stride, // Stride between matrices
- const int matrix_row_stride // Stride between rows of the output matrix
- );
-
- template <const int pad_top, const int pad_left,
- const int pad_bottom, const int pad_right,
- const int proc_channels>
- static void process_tile_row(
- const int tile_N, // Number of tiles in the row
- const T* const input, // Base input pointer (appropriate to batch, channel and row)
- const int input_row_stride, // Stride between rows of the input
- const int input_col_stride, // Stride between columns of the input
- T* const matrix, // 1st output matrix (appropriate to batch, channel and row)
- const int matrix_stride, // Stride between matrices
- const int matrix_row_stride // Stride between rows of the output matrix
- );
- };
-
- template <typename T>
- struct Winograd2x2_3x3GemmInputChannelwise {
- static void execute(
- const T *inptr,
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- const int tile_M,
- const int tile_N,
- T *outptr_base,
- const int matrix_stride,
- const int matrix_batch_stride,
- const int matrix_row_stride
- );
-
- static size_t bytes_read(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- // We read as many bytes as we write
- return bytes_written(input_shape, output_shape);
- }
-
- static int flops_performed(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- return input_shape.n_batches * tile_rows * 32 * tile_cols * input_shape.n_channels;
- }
-
- static size_t bytes_written(const Tensor4DShape &input_shape,
- const Tensor4DShape &output_shape) {
- return winograd::Winograd2x2_3x3GemmInput<T>::bytes_written(input_shape, output_shape);
- }
-
- protected:
- typedef void (*tilefunc)(int, const T*, int, int, T*, int);
- template <const int pad_top,
- const int pad_left,
- const int pad_bottom,
- const int pad_right>
- static void process_tile(
- int n_channels, // Number of channels in the tile
- const T* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- T* const matrix_base,
- const int matrix_stride
- );
-
- private:
- template <const int pad_top,
- const int pad_left,
- const int pad_bottom,
- const int pad_right,
- const int proc_channels>
- static void _process_tile(
- int &n_channels, const T* &inptr,
- const int input_row_stride, const int input_col_stride,
- T* &outptr, const int matrix_stride
- );
- };
-}
-
-/*****************************************************************************/
-// Include specialised implementations here
-#include "input_2x2_3x3/a64_float.hpp"
-#include "input_2x2_3x3/a64_float_channelwise.hpp"
-/*****************************************************************************/
-
-/*****************************************************************************/
-template <typename T>
-void winograd::Winograd2x2_3x3GemmInput<T>::execute(
- const T *inptr_base,
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- const int tile_M,
- const int tile_N,
- T *outptr_base,
- const int matrix_stride,
- const int matrix_batch_stride,
- const int matrix_row_stride
-) {
- // Select an appropriate matrix processing method for the shape and padding
- // of the input tensor.
- typedef void (*tensorfunc)(int, int, int, const T*, int, int, T*, int, int);
- const auto process_tensor = [&padding_type, &input_shape] () -> tensorfunc {
- if (padding_type == PADDING_VALID) {
- const int pad_bottom = input_shape.n_rows % 2;
- const int pad_right = input_shape.n_cols % 2;
-
- if (pad_bottom == 0 && pad_right == 0) {
- return process_tile_tensor<PADDING_VALID, 0, 0>;
- } else if (pad_bottom == 0 && pad_right == 1) {
- return process_tile_tensor<PADDING_VALID, 0, 1>;
- } else if (pad_bottom == 1 && pad_right == 0) {
- return process_tile_tensor<PADDING_VALID, 1, 0>;
- } else if (pad_bottom == 1 && pad_right == 1) {
- return process_tile_tensor<PADDING_VALID, 1, 1>;
- }
- } else { // PADDING_SAME
- const int pad_bottom = 1 + input_shape.n_rows % 2;
- const int pad_right = 1 + input_shape.n_cols % 2;
-
- if (pad_bottom == 1 && pad_right == 1) {
- return process_tile_tensor<PADDING_SAME, 1, 1>;
- } else if (pad_bottom == 1 && pad_right == 2) {
- return process_tile_tensor<PADDING_SAME, 1, 2>;
- } else if (pad_bottom == 2 && pad_right == 1) {
- return process_tile_tensor<PADDING_SAME, 2, 1>;
- } else if (pad_bottom == 2 && pad_right == 2) {
- return process_tile_tensor<PADDING_SAME, 2, 2>;
- }
- }
-
- printf("%s::%u Uncovered case.\n", __FILE__, __LINE__);
- exit(-1);
- return NULL; // No function found
- } ();
-
- // Compute strides
- const int input_row_stride = input_shape.n_cols * input_shape.n_channels;
- const int input_col_stride = input_shape.n_channels;
-
- // Process each batch of the tensor in turn.
- for (int batch = 0; batch < input_shape.n_batches; batch++) {
- // Work out pointers
- const T *inptr = inptr_base + (batch * input_shape.n_rows *
- input_shape.n_cols * input_shape.n_channels);
- T *outptr = outptr_base + batch * matrix_batch_stride;
-
- // Delegate doing the actual work
- process_tensor(
- tile_M, tile_N, input_shape.n_channels,
- inptr, input_row_stride, input_col_stride,
- outptr, matrix_stride, matrix_row_stride
- );
- }
-}
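The dispatcher above encodes the bottom/right zero padding of the last row and column of tiles in template parameters: 0 or 1 extra elements for VALID padding and 1 or 2 for SAME, depending on input parity, while each tile produces a 2x2 patch of the output. The helper below is only an illustration of that selection logic for a 3x3, stride-1 convolution; it is not part of the library.

    #include <cstdio>

    // Same rounding as the iceildiv helper used elsewhere in this header.
    static int iceildiv_example(const int a, const int b) { return (a + b - 1) / b; }

    void describe_2x2_3x3_tiling(const int n_rows, const int n_cols, const bool same_padding)
    {
        // Output size of a 3x3, stride-1 convolution.
        const int out_rows = same_padding ? n_rows : n_rows - 2;
        const int out_cols = same_padding ? n_cols : n_cols - 2;

        // Each tile covers a 2x2 patch of the output.
        const int tile_M = iceildiv_example(out_rows, 2);
        const int tile_N = iceildiv_example(out_cols, 2);

        // Zero padding applied to the last row/column of tiles (cf. the dispatcher above).
        const int pad_bottom = (same_padding ? 1 : 0) + n_rows % 2;
        const int pad_right  = (same_padding ? 1 : 0) + n_cols % 2;

        std::printf("%d x %d tiles, pad_bottom=%d, pad_right=%d\n",
                    tile_M, tile_N, pad_bottom, pad_right);
    }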
-
-/*****************************************************************************/
-template <typename T>
-template <const PaddingType padding, const int pad_bottom, const int pad_right>
-void winograd::Winograd2x2_3x3GemmInput<T>::process_tile_tensor(
- const int tile_M, // Number of rows of tiles
- const int tile_N, // Number of columns of tiles
- int n_channels, // Number of input channels
- const T* const input, // Base input pointer (appropriate to batch and channel)
- const int input_row_stride, // Stride between rows of the input
- const int input_col_stride, // Stride between columns of the input
- T* const matrix, // 1st output matrix (appropriate to batch and channel)
- const int matrix_stride, // Stride between matrices
- const int matrix_row_stride // Stride between rows of the output matrix
-) {
- // Base row processing functions
- typedef void (*rowfunc)(int, const T*, int, int, T*, int, int);
- const rowfunc process_top_row[3] = {
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 1>
- : process_tile_row<1, 1, 0, pad_right, 1>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 2>
- : process_tile_row<1, 1, 0, pad_right, 2>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 4>
- : process_tile_row<1, 1, 0, pad_right, 4>,
- };
- const rowfunc process_middle_row[3] = {
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 1>
- : process_tile_row<0, 1, 0, pad_right, 1>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 2>
- : process_tile_row<0, 1, 0, pad_right, 2>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, 0, pad_right, 4>
- : process_tile_row<0, 1, 0, pad_right, 4>,
- };
- const rowfunc process_bottom_row[3] = {
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, pad_bottom, pad_right, 1>
- : process_tile_row<0, 1, pad_bottom, pad_right, 1>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, pad_bottom, pad_right, 2>
- : process_tile_row<0, 1, pad_bottom, pad_right, 2>,
- (padding == PADDING_VALID)
- ? process_tile_row<0, 0, pad_bottom, pad_right, 4>
- : process_tile_row<0, 1, pad_bottom, pad_right, 4>,
- };
-
- // Method to get an input pointer for the given tile row
- const auto get_inptr = [&input, &input_row_stride] (const int tile_i) {
- if (padding == PADDING_VALID) {
- return input + 2 * tile_i * input_row_stride;
- } else {
- return input + (2 * tile_i - (tile_i ? 1 : 0)) * input_row_stride;
- }
- };
-
- // Wrapper to process a row of tiles, covering all channels.
- const auto process_row =
- [tile_N, input_row_stride, input_col_stride, matrix_stride, matrix_row_stride, n_channels]
- (const rowfunc f[3], const T *inptr, T *outptr) {
- int rem_channels = n_channels;
-
- // While there remain channels to process continue to process the
- // row.
- for (; rem_channels >= 4; rem_channels -= 4, inptr += 4, outptr += 4) {
- f[2](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
- }
- for (; rem_channels >= 2; rem_channels -= 2, inptr += 2, outptr += 2) {
- f[1](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
- }
- if (rem_channels) {
- f[0](tile_N, inptr, input_row_stride, input_col_stride, outptr, matrix_stride, matrix_row_stride);
- }
- };
-
- // Process all rows of tiles in the tensor
- for (int tile_i = 0; tile_i < tile_M; tile_i++) {
- T* const m_row = matrix + tile_i * tile_N * matrix_row_stride;
- const T *row_inptr = get_inptr(tile_i);
-
- if (tile_i == 0) {
- // Top row of the input
- process_row(process_top_row, row_inptr, m_row);
- } else if (tile_i == tile_M - 1) {
- // Bottom row of the input
- process_row(process_bottom_row, row_inptr, m_row);
- } else {
- // Any other row of the input
- process_row(process_middle_row, row_inptr, m_row);
- }
- }
-}
-
-/*****************************************************************************/
-template <typename T>
-template <const int pad_top, const int pad_left,
- const int pad_bottom, const int pad_right,
- const int proc_channels>
-void winograd::Winograd2x2_3x3GemmInput<T>::process_tile_row(
- const int tile_N, // Number of tiles in the row
- const T* const input, // Base input pointer (appropriate to batch, channel and row)
- const int input_row_stride, // Stride between rows of the input
- const int input_col_stride, // Stride between columns of the input
- T* const matrix, // 1st output matrix (appropriate to batch, channel and row)
- const int matrix_stride, // Stride between matrices
- const int matrix_row_stride // Stride between rows of the output matrix
-) {
- // Construct copies of the pointers
- const T *inptr = input;
- T *outptr = matrix;
-
- // Storage for the tensors x, X.T x, and X.T x X.
- T x[4][4][proc_channels], XTx[4][4][proc_channels], XTxX[4][4][proc_channels];
-
- // For every tile in the row
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- // Determine the padding for the tile
- const int tile_pad_left = (tile_j == 0) ? pad_left : 0;
- const int tile_pad_right = (tile_j == tile_N - 1) ? pad_right : 0;
-
- // Load tile values. If this is the first tile in the row then we must load
- // all values, otherwise we can just load the final two columns of the input.
- for (int i = 0; i < 4; i++) {
- for (int j = ((tile_j == 0) ? 0 : 2); j < 4; j++) {
- // Fill with padding if required
- if (i < pad_top || 4 - pad_bottom <= i ||
- j < tile_pad_left || 4 - tile_pad_right <= j) {
- for (int c = 0; c < proc_channels; c++) {
- x[i][j][c] = static_cast<T>(0); // Padding
- }
- } else {
- // Load values, note that the initial padding offsets the pointer we
- // were provided.
- for (int c = 0; c < proc_channels; c++) {
- const int row_offset = (i - pad_top) * input_row_stride;
- const int col_offset = (j - tile_pad_left) * input_col_stride;
- x[i][j][c] = inptr[row_offset + col_offset + c];
- }
- }
- }
- }
-
- // Compute the matrix X.T x. Note, can elide operations depending on the
- // padding. Furthermore, if this isn't the left-most tile we can skip half
- // of the operations by copying results from the previous version of X.T x.
- // This latter optimisation can be simplified by unrolling the outermost
- // loop by two and by renaming the registers containing XTx.
- if (tile_j == 0) {
- for (int j = 0; j < 4; j++) {
- for (int c = 0; c < proc_channels; c++) {
- XTx[0][j][c] = x[0][j][c] - x[2][j][c];
- XTx[1][j][c] = x[1][j][c] + x[2][j][c];
- XTx[2][j][c] = -x[1][j][c] + x[2][j][c];
- XTx[3][j][c] = x[1][j][c] - x[3][j][c];
- }
- }
- } else {
- for (int j = 0; j < 2; j++) {
- for (int c = 0; c < proc_channels; c++) {
- XTx[0][j][c] = XTx[0][j + 2][c];
- XTx[1][j][c] = XTx[1][j + 2][c];
- XTx[2][j][c] = XTx[2][j + 2][c];
- XTx[3][j][c] = XTx[3][j + 2][c];
- }
- }
- for (int j = 2; j < 4; j++) {
- for (int c = 0; c < proc_channels; c++) {
- XTx[0][j][c] = x[0][j][c] - x[2][j][c];
- XTx[1][j][c] = x[1][j][c] + x[2][j][c];
- XTx[2][j][c] = -x[1][j][c] + x[2][j][c];
- XTx[3][j][c] = x[1][j][c] - x[3][j][c];
- }
- }
- }
-
- // Compute the matrix X.T x X. Note, can elide operations based on the
- // padding.
- for (int i = 0; i < 4; i++) {
- for (int c = 0; c < proc_channels; c++) {
- XTxX[i][0][c] = XTx[i][0][c] - XTx[i][2][c];
- XTxX[i][1][c] = XTx[i][1][c] + XTx[i][2][c];
- XTxX[i][2][c] = -XTx[i][1][c] + XTx[i][2][c];
- XTxX[i][3][c] = XTx[i][1][c] - XTx[i][3][c];
- }
- }
-
- // Store the output matrix (X.T x X)
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 4; j++) {
- // Get a pointer to the relevant output matrix
- T *mptr = outptr + (i*4 + j)*matrix_stride;
-
- // Write out the channels
- for (int c = 0; c < proc_channels; c++) {
- mptr[c] = XTxX[i][j][c];
- }
- }
- }
-
- // Update the pointers
- inptr += input_col_stride * ((tile_j == 0 && pad_left) ? 1 : 2);
- outptr += matrix_row_stride;
- }
-}
-
-/*****************************************************************************/
-template <typename T>
-void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::execute(
- const T *inptr,
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- const int tile_M,
- const int tile_N,
- T *outptr_base,
- const int matrix_stride,
- const int matrix_batch_stride,
- const int matrix_row_stride
-) {
- const int n_channels = input_shape.n_channels;
- const int input_col_stride = n_channels;
- const int input_row_stride = input_shape.n_cols * input_col_stride;
-
- // Determine the padding and hence select appropriate methods for each tile.
- tilefunc fs[3][3];
-
- if (padding_type == PADDING_VALID) {
- constexpr int pad_top = 0;
- constexpr int pad_left = 0;
- const int pad_right = input_shape.n_cols % 2 == 0;
-
- fs[0][0] = process_tile<pad_top, pad_left, 0, 0>;
- fs[0][1] = process_tile<pad_top, 0, 0, 0>;
- fs[0][2] = (pad_right) ? process_tile<pad_top, 0, 0, 0> : process_tile<pad_top, 0, 0, 1>;
-
- fs[1][0] = process_tile<0, pad_left, 0, 0>;
- fs[1][1] = process_tile<0, 0, 0, 0>;
- fs[1][2] = (pad_right) ? process_tile<0, 0, 0, 0> : process_tile<0, 0, 0, 1>;
-
- if (input_shape.n_rows % 2 == 0) {
- constexpr int pad_bottom = 0;
- fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
- fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
- fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 0> : process_tile<0, 0, pad_bottom, 1>;
- } else {
- constexpr int pad_bottom = 1;
- fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
- fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
- fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 0> : process_tile<0, 0, pad_bottom, 1>;
- }
- } else {
- constexpr int pad_top = 1;
- constexpr int pad_left = 1;
- const int pad_right = input_shape.n_cols % 2 == 0;
-
- fs[0][0] = process_tile<pad_top, pad_left, 0, 0>;
- fs[0][1] = process_tile<pad_top, 0, 0, 0>;
- fs[0][2] = (pad_right) ? process_tile<pad_top, 0, 0, 1> : process_tile<pad_top, 0, 0, 2>;
-
- fs[1][0] = process_tile<0, pad_left, 0, 0>;
- fs[1][1] = process_tile<0, 0, 0, 0>;
- fs[1][2] = (pad_right) ? process_tile<0, 0, 0, 1> : process_tile<0, 0, 0, 2>;
-
- if (input_shape.n_rows % 2 == 0) {
- constexpr int pad_bottom = 1;
- fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
- fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
- fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 1> : process_tile<0, 0, pad_bottom, 2>;
- } else {
- constexpr int pad_bottom = 2;
- fs[2][0] = process_tile<0, pad_left, pad_bottom, 0>;
- fs[2][1] = process_tile<0, 0, pad_bottom, 0>;
- fs[2][2] = (pad_right) ? process_tile<0, 0, pad_bottom, 1> : process_tile<0, 0, pad_bottom, 2>;
- }
- }
-
- // Process each tile in turn
- for (int batch = 0; batch < input_shape.n_batches; batch++) {
- const T* const input_base_batch = inptr + batch*input_shape.n_rows*input_shape.n_cols*n_channels;
-
- for (int tile_i = 0; tile_i < tile_M; tile_i++) {
- const int row_offset = (tile_i == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1);
- const T* const input_base_row = input_base_batch + (2*tile_i - row_offset)*input_shape.n_cols*n_channels;
-
- // Select the set of functions for the row
- const int fs_i = (tile_i == 0) ? 0 : ((tile_i < tile_M - 1) ? 1 : 2);
-
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- // Select the function for the column
- const int fs_j = (tile_j == 0) ? 0 : ((tile_j < tile_N - 1) ? 1 : 2);
- const auto f = fs[fs_i][fs_j];
-
- // Get pointers into the input and outputs
- const int col_offset = (tile_j == 0) ? 0 : ((padding_type == PADDING_VALID) ? 0 : 1);
- const T* const input_base_col = input_base_row + (2*tile_j - col_offset)*n_channels;
- T* const matrix_base = outptr_base + batch*matrix_batch_stride + (tile_i*tile_N + tile_j)*matrix_row_stride;
- f(n_channels, input_base_col, input_row_stride, input_col_stride,
- matrix_base, matrix_stride);
- }
- }
- }
-}
-
-template <typename T>
-template <const int pad_top,
- const int pad_left,
- const int pad_bottom,
- const int pad_right>
-void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::process_tile(
- int n_channels, // Number of channels in the tile
- const T* const input_base,
- const int input_row_stride,
- const int input_col_stride,
- T* const matrix_base,
- const int matrix_stride
-) {
- // Copy pointers
- const T *inptr = input_base;
- T *outptr = matrix_base;
-
- // Process channels (modifies inptr, outptr and n_channels)
- _process_tile<pad_top, pad_left, pad_bottom, pad_right, 4>(
- n_channels, inptr, input_row_stride, input_col_stride,
- outptr, matrix_stride
- );
- _process_tile<pad_top, pad_left, pad_bottom, pad_right, 2>(
- n_channels, inptr, input_row_stride, input_col_stride,
- outptr, matrix_stride
- );
- _process_tile<pad_top, pad_left, pad_bottom, pad_right, 1>(
- n_channels, inptr, input_row_stride, input_col_stride,
- outptr, matrix_stride
- );
-}
-
-template <typename T>
-template <const int pad_top,
- const int pad_left,
- const int pad_bottom,
- const int pad_right,
- const int proc_channels>
-void winograd::Winograd2x2_3x3GemmInputChannelwise<T>::_process_tile(
- int &n_channels,
- const T* &inptr, const int input_row_stride, const int input_col_stride,
- T* &outptr, const int matrix_stride
-) {
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- T* outptrs[4] = {
- outptr,
- outptr + matrix_stride * 4,
- outptr + matrix_stride * 8,
- outptr + matrix_stride * 12
- };
-
- // The matrix X; zeroed to account for padding.
- T x[4][4];
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 4; j++) {
- x[i][j] = 0;
- }
- }
-
- // The matrices X.T x and U
- T XTx[4][4], U[4][4];
-
- // Now progress through each channel
- for (; n_channels >= proc_channels; n_channels -= proc_channels) {
- for (int n = 0; n < proc_channels; n++) {
- // Load the matrix X
- for (int cell_i = pad_top, i = 0; cell_i < 4 - pad_bottom; cell_i++, i++) {
- for (int cell_j = pad_left, j = 0; cell_j < 4 - pad_right; cell_j++, j++) {
- x[cell_i][cell_j] = inptr[i*input_row_stride + j*input_col_stride];
- }
- }
- inptr++;
-
- // Compute the matrix X.T x
- for (int j = 0; j < 4; j++) {
- XTx[0][j] = x[0][j] - x[2][j];
- XTx[1][j] = x[1][j] + x[2][j];
- XTx[2][j] = x[2][j] - x[1][j];
- XTx[3][j] = x[1][j] - x[3][j];
- }
-
- // Hence compute the matrix U
- for (int i = 0; i < 4; i++) {
- U[i][0] = XTx[i][0] - XTx[i][2];
- U[i][1] = XTx[i][1] + XTx[i][2];
- U[i][2] = XTx[i][2] - XTx[i][1];
- U[i][3] = XTx[i][1] - XTx[i][3];
- }
-
- // Store the matrix U
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 4; j++) {
- outptrs[i][j * matrix_stride] = U[i][j];
- }
- outptrs[i]++;
- }
- }
- }
-
- // Update the output pointer for future calls
- outptr = outptrs[0];
-}
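Stripped of the channel blocking and pointer arithmetic, each 4x4 input tile x is mapped to U = X.T x X using only additions and subtractions; the row and column combinations below are exactly the ones coded in _process_tile above, so this scalar reference can be used to sanity-check the vectorised paths.

    // Scalar reference for one 4x4 Winograd F(2x2, 3x3) input tile: U = X.T x X.
    inline void winograd_2x2_3x3_input_tile_ref(const float x[4][4], float U[4][4])
    {
        float XTx[4][4];

        // Apply X.T to the rows of x (same combinations as _process_tile above).
        for (int j = 0; j < 4; j++)
        {
            XTx[0][j] = x[0][j] - x[2][j];
            XTx[1][j] = x[1][j] + x[2][j];
            XTx[2][j] = x[2][j] - x[1][j];
            XTx[3][j] = x[1][j] - x[3][j];
        }

        // Multiply on the right by X to combine the columns.
        for (int i = 0; i < 4; i++)
        {
            U[i][0] = XTx[i][0] - XTx[i][2];
            U[i][1] = XTx[i][1] + XTx[i][2];
            U[i][2] = XTx[i][2] - XTx[i][1];
            U[i][3] = XTx[i][1] - XTx[i][3];
        }
    }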
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp
deleted file mode 100644
index 6c7f136..0000000
--- a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float.hpp
+++ /dev/null
@@ -1,1491 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include "../input_2x2_3x3.hpp"
-
-#ifdef __aarch64__
-namespace winograd {
-
-// Pad left by one column, pad right by one column, no upper or lower padding, 4 channels
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<0, 1, 0, 1, 4>(
- const int tile_N, // Number of tiles in the row
- const float* const input, // Base input pointer (appropriate to batch, channel and row)
- const int input_row_stride, // Stride between rows of the input
- const int input_col_stride, // Stride between columns of the input
- float* const matrix, // 1st output matrix (appropriate to batch, channel and row)
- const int matrix_stride, // Stride between matrices
- const int matrix_row_stride // Stride between rows of the output matrix
-) {
- /* SIMD register allocation
- * ========================
- *
- * In the following code we read 4x4 tiles of a matrix `x`, with which we
- * compute another matrix `X.T x` where:
- *
- * / 1 0 0 0 \
- * X = | 0 1 -1 1 |
- * | -1 1 1 0 |
- * \ 0 0 0 -1 /
- *
- * Hence, `X.T` is a program which operates upon rows of the matrix `X`.
- * We subsequently compute and store the matrix `U = (X.T x) X`.
- *
- * Importantly, each iteration of the loop below loads a new matrix `x'`
- * where the final two columns of `x'` are the first two columns of the
- * previous `x`. That is:
- *
- * x11 x12 x13 x14
- * x21 x22 x23 x24
- * x31 x32 x33 x34
- * x41 x42 x43 x44
- *
- * x'11 x'12 x'13 x'14
- * x'21 x'22 x'23 x'24
- * x'31 x'32 x'33 x'34
- * x'41 x'42 x'43 x'44
- *
- * Consequently, while the first iteration of the below loop must load 16
- * values for `x`, the second need load only 8. *Furthermore*, since we noted
- * above that the operation `X.T x` was a program which operated upon *rows*
- * of the matrix `x` it follows that the relation `x'[i][1] =
- * x[i][3]` and `x'[i][2] = x[i][4]` applies also to the matrices `X.T x'` and
- * `X.T x`. That is:
- *
- * (X.T x)11 (X.T x)12 (X.T x)13 (X.T x)14
- * (X.T x)21 (X.T x)22 (X.T x)23 (X.T x)24
- * (X.T x)31 (X.T x)32 (X.T x)33 (X.T x)34
- * (X.T x)41 (X.T x)42 (X.T x)43 (X.T x)44
- *
- * (X.T x')11 (X.T x')12 (X.T x')13 (X.T x')14
- * (X.T x')21 (X.T x')22 (X.T x')23 (X.T x')24
- * (X.T x')31 (X.T x')32 (X.T x')33 (X.T x')34
- * (X.T x')41 (X.T x')42 (X.T x')43 (X.T x')44
- *
- * Hence, as well as not needing to load new values for x'[i][1..2] it is
- * also unnecessary to recompute values for (X.T x')[i][1..2].
- *
- * Following this we break the registers into blocks `A` and `B` used by the
- * two stages of the unrolled loop. These registers are named such that the
- * latter columns of `A` become the earlier columns of `B` and vice-versa:
- *
- * AXTx11 AXTx12 > AXTx13 AXTx14 |
- * AXTx21 AXTx22 > AXTx23 AXTx24 |
- * AXTx31 AXTx32 > AXTx33 AXTx34 |
- * AXTx41 AXTx42 > AXTx43 AXTx44 |
- *
- * BXTx13 BXTx14 | BXTx11 BXTx12 >
- * BXTx23 BXTx24 | BXTx21 BXTx22 >
- * BXTx33 BXTx34 | BXTx31 BXTx32 >
- * BXTx43 BXTx44 | BXTx41 BXTx42 >
- *
- * These 32 named registers require only 16 architectural registers. One
- * additional architectural register is used as scratch space and 8
- * architectural registers are used to load in the values x[1..4][3,4].
- *
- * Input and output addressing
- * ===========================
- */
- const float *inptr0 = input;
- const float *inptr1 = input + input_row_stride;
- const float *inptr2 = input + input_row_stride * 2;
- const float *inptr3 = input + input_row_stride * 3;
-
- float *outptr0 = matrix;
- float *outptr4 = matrix + matrix_stride * 4;
- float *outptr8 = matrix + matrix_stride * 8;
- float *outptr12 = matrix + matrix_stride * 12;
-
- int tile_j = tile_N; // Tiles to process
-
- asm volatile (
- // Named SIMD registers according to the policy given above
- // Registers into which to load the latter two columns of `x`
- "x_13 .req v0\n qx_13 .req q0\n" "x_14 .req v4\n qx_14 .req q4\n"
- "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
- "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
- "x_43 .req v3\n qx_43 .req q3\n" "x_44 .req v7\n qx_44 .req q7\n"
-
- // Registers for storing X.T x (both A and B halves)
- "AXTx11 .req v8\n" "BXTx13 .req v8\n"
- "AXTx12 .req v9\n" "BXTx14 .req v9\n" "qAXTx12 .req q9\n"
- "AXTx21 .req v10\n" "BXTx23 .req v10\n"
- "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
- "AXTx31 .req v12\n" "BXTx33 .req v12\n"
- "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
- "AXTx41 .req v14\n" "BXTx43 .req v14\n"
- "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
- "AXTx13 .req v16\n" "BXTx11 .req v16\n"
- "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
- "AXTx23 .req v18\n" "BXTx21 .req v18\n"
- "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
- "AXTx33 .req v20\n" "BXTx31 .req v20\n"
- "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
- "AXTx43 .req v22\n" "BXTx41 .req v22\n"
- "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
-
- "U .req v24\n qU .req q24\n"
-
- // ----------------------------------------------------------------------
- // Head of loop
- // Loads a complete 4x4 tile of x, computes X.T x, computes and stores
- // `U = X.T x X`. Prepares for the 'A' half of the loop.
- // NOTE: Since the first tile has the leftmost column padded we can
- // skip 4 loads and 4 calculations for the matrix X.T x X.
-
- // Temporarily alias registers for computing the first (non-padded)
- // column of x.
- "x_12 .req v0\n qx_12 .req q0\n"
- "x_22 .req v1\n qx_22 .req q1\n"
- "x_32 .req v2\n qx_32 .req q2\n"
- "x_42 .req v3\n qx_42 .req q3\n"
-
- "ldr qx_12, [%x[inptr0]]\n"
- "ldr qx_22, [%x[inptr1]]\n"
- "ldr qx_32, [%x[inptr2]]\n"
- "ldr qx_42, [%x[inptr3]]\n"
-
- "fsub BXTx12.4s, x_12.4s, x_32.4s\n"
- "fadd BXTx22.4s, x_22.4s, x_32.4s\n"
- "fsub BXTx32.4s, x_32.4s, x_22.4s\n"
- "fsub BXTx42.4s, x_22.4s, x_42.4s\n"
-
- ".unreq x_12\n .unreq qx_12\n"
- ".unreq x_22\n .unreq qx_22\n"
- ".unreq x_32\n .unreq qx_32\n"
- ".unreq x_42\n .unreq qx_42\n"
-
-    // Load and compute the latter two columns of the first tile. Advance the
-    // input pointers by three columns so that each points at the second
-    // column of the next tile, that is, at the first column which must be
-    // read for the next tile.
- "ldr qx_13, [%x[inptr0], %x[colstride1]]\n"
- "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
- "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
- "ldr qx_43, [%x[inptr3], %x[colstride1]]\n"
-
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride2]]\n"
-
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
-
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
-
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride2]]\n"
-
- "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride3]\n"
-
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
-
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
-
- "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride3]\n"
-
- // Compute and store U for the first tile
- // First row
- "fneg U.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fneg U.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fneg U.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row, simultaneously load the first column of inputs for the
- // next tile.
- "fneg U.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- // Update the loop counter, subtract two to account for both the head and
- // the tail.
- "subs %x[tile_j], %x[tile_j], #2\n"
- "beq 2f\n" // Jump to "A" tail if out of tiles
-
- // ----------------------------------------------------------------------
- "1:"
- // Start part A
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
- "fsub AXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
- "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "fsub AXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- "subs %x[tile_j], %x[tile_j], #1\n"
- "beq 3f\n" // Jump to 'B' tail
-
- // Start part B
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
- "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
- "subs %x[tile_j], %x[tile_j], #1\n"
- "bne 1b\n" // Continue loop, otherwise flow into 'A' tail
-
- // ----------------------------------------------------------------------
- "2:"
- // 'A' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- "b 4f\n" // Jump to end of function
-
- // ----------------------------------------------------------------------
- "3:"
- // 'B' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- // ----------------------------------------------------------------------
- "4:"
- // End of function
-
- // Clear names
- ".unreq x_13\n" ".unreq qx_13\n" ".unreq x_14\n" ".unreq qx_14\n"
- ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
- ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
- ".unreq x_43\n" ".unreq qx_43\n" ".unreq x_44\n" ".unreq qx_44\n"
- ".unreq AXTx11\n" ".unreq BXTx13\n"
- ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
- ".unreq AXTx21\n" ".unreq BXTx23\n"
- ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
- ".unreq AXTx31\n" ".unreq BXTx33\n"
- ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
- ".unreq AXTx41\n" ".unreq BXTx43\n"
- ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
- ".unreq AXTx13\n" ".unreq BXTx11\n"
- ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
- ".unreq AXTx23\n" ".unreq BXTx21\n"
- ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
- ".unreq AXTx33\n" ".unreq BXTx31\n"
- ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
- ".unreq AXTx43\n" ".unreq BXTx41\n"
- ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
- ".unreq U\n" ".unreq qU\n"
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [inptr3] "+r" (inptr3),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [tile_j] "+r" (tile_j) // Tile counter
- : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
- [colstride2] "r" (2 * input_col_stride * sizeof(float)),
- [colstride3] "r" (3 * input_col_stride * sizeof(float)),
- [mstride1] "r" (1 * matrix_stride * sizeof(float)),
- [mstride2] "r" (2 * matrix_stride * sizeof(float)),
- [mstride3] "r" (3 * matrix_stride * sizeof(float)),
- [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24"
- );
-}
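For reference when reading these kernels: per 4x4 data tile x they compute U = X.T x X, in the notation of the comments above, where X.T is the constant matrix [1 0 -1 0; 0 1 1 0; 0 -1 1 0; 0 1 0 -1] and each 128-bit register carries four consecutive floats for one tile element; the head and tail of each kernel additionally exploit the zero padding of the leftmost and rightmost tile columns. Below is a minimal scalar sketch of the unpadded per-tile arithmetic; the helper name is hypothetical and the code is illustrative only, not part of the library API.

// Scalar reference for the per-tile arithmetic performed by the deleted
// assembly (hypothetical helper; the assembly does the same operations on
// four floats per instruction).
static inline void winograd_2x2_3x3_input_tile_ref(const float x[4][4], float U[4][4])
{
    float XTx[4][4]; // X.T x, in the notation of the comments above

    // Combine rows: XTx = X.T x with X.T = [1 0 -1 0; 0 1 1 0; 0 -1 1 0; 0 1 0 -1]
    for (int j = 0; j < 4; j++)
    {
        XTx[0][j] = x[0][j] - x[2][j];
        XTx[1][j] = x[1][j] + x[2][j];
        XTx[2][j] = x[2][j] - x[1][j];
        XTx[3][j] = x[1][j] - x[3][j];
    }

    // Combine columns: U = (X.T x) X
    for (int i = 0; i < 4; i++)
    {
        U[i][0] = XTx[i][0] - XTx[i][2];
        U[i][1] = XTx[i][1] + XTx[i][2];
        U[i][2] = XTx[i][2] - XTx[i][1];
        U[i][3] = XTx[i][1] - XTx[i][3];
    }
}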
-
-// Pad top, left and right by 1.
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<1, 1, 0, 1, 4>(
- const int tile_N,
- const float* const input,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix,
- const int matrix_stride,
- const int matrix_row_stride
-) {
- const float *inptr0 = input;
- const float *inptr1 = input + input_row_stride;
- const float *inptr2 = input + input_row_stride * 2;
-
- float *outptr0 = matrix;
- float *outptr4 = matrix + matrix_stride * 4;
- float *outptr8 = matrix + matrix_stride * 8;
- float *outptr12 = matrix + matrix_stride * 12;
-
- int tile_j = tile_N; // Tiles to process
-
- asm volatile (
- // Named SIMD registers according to the policy given above
- // Registers into which to load the latter two columns of `x`
- // NOTE: We need only load the latter three rows since we know that the
- // first row is padded.
- "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
- "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
- "x_43 .req v3\n qx_43 .req q3\n" "x_44 .req v7\n qx_44 .req q7\n"
-
- // Registers for storing X.T x (both A and B halves)
- "AXTx11 .req v8\n" "BXTx13 .req v8\n"
- "AXTx12 .req v9\n" "BXTx14 .req v9\n" "qAXTx12 .req q9\n"
- "AXTx21 .req v10\n" "BXTx23 .req v10\n"
- "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
- "AXTx31 .req v12\n" "BXTx33 .req v12\n"
- "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
- "AXTx41 .req v14\n" "BXTx43 .req v14\n"
- "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
- "AXTx13 .req v16\n" "BXTx11 .req v16\n"
- "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
- "AXTx23 .req v18\n" "BXTx21 .req v18\n"
- "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
- "AXTx33 .req v20\n" "BXTx31 .req v20\n"
- "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
- "AXTx43 .req v22\n" "BXTx41 .req v22\n"
- "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
-
- "U .req v24\n qU .req q24\n"
-
- // ----------------------------------------------------------------------
- // Head of loop
- // Loads a complete 4x4 tile of x, computes X.T x, computes and stores
- // `U = X.T x X`. Prepares for the 'A' half of the loop.
- // NOTE: Since the first tile has the leftmost column padded we can
- // skip 4 loads and 4 calculations for the matrix X.T x X.
-
- // Temporarily alias registers for computing the first (non-padded)
- // column of x.
- "x_22 .req v1\n qx_22 .req q1\n"
- "x_32 .req v2\n qx_32 .req q2\n"
- "x_42 .req v3\n qx_42 .req q3\n"
-
- "ldr qx_22, [%x[inptr1]]\n"
- "ldr qx_32, [%x[inptr2]]\n"
- "ldr qx_42, [%x[inptr3]]\n"
-
- "fneg BXTx12.4s, x_32.4s\n"
- "fadd BXTx22.4s, x_22.4s, x_32.4s\n"
- "fsub BXTx32.4s, x_32.4s, x_22.4s\n"
- "fsub BXTx42.4s, x_22.4s, x_42.4s\n"
-
- ".unreq x_22\n .unreq qx_22\n"
- ".unreq x_32\n .unreq qx_32\n"
- ".unreq x_42\n .unreq qx_42\n"
-
-    // Load and compute the latter two columns of the first tile. Advance the
-    // input pointers by three columns so that each points at the second
-    // column of the next tile, that is, at the first column which must be
-    // read for the next tile.
- "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
- "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
- "ldr qx_43, [%x[inptr3], %x[colstride1]]\n"
-
- "fneg BXTx13.4s, x_33.4s\n"
-
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
-
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
-
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride2]]\n"
-
- "fneg BXTx14.4s, x_34.4s\n"
-
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
-
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
-
- "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride3]\n"
-
- // Compute and store U for the first tile
- // First row
- "fneg U.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fneg U.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fneg U.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row, simultaneously load the first column of inputs for the
- // next tile.
- "fneg U.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- // Update the loop counter, subtract two to account for both the head and
- // the tail.
- "subs %x[tile_j], %x[tile_j], #2\n"
- "beq 2f\n" // Jump to "A" tail if out of tiles
-
- // ----------------------------------------------------------------------
- "1:"
- // Start part A
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fneg AXTx13.4s, x_33.4s\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
- "fneg AXTx14.4s, x_34.4s\n"
- "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "fsub AXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
-
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- "subs %x[tile_j], %x[tile_j], #1\n"
- "beq 3f\n" // Jump to 'B' tail
-
- // Start part B
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fneg BXTx13.4s, x_33.4s\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
- "ldr qx_44, [%x[inptr3], %x[colstride1]]\n"
- "fneg BXTx14.4s, x_34.4s\n"
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "fsub BXTx44.4s, x_24.4s, x_44.4s\n"
- "add %x[inptr3], %x[inptr3], %x[colstride2]\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "ldr qx_43, [%x[inptr3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
- "subs %x[tile_j], %x[tile_j], #1\n"
- "bne 1b\n" // Continue loop, otherwise flow into 'A' tail
-
- // ----------------------------------------------------------------------
- "2:"
- // 'A' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fneg AXTx13.4s, x_33.4s\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "fsub AXTx43.4s, x_23.4s, x_43.4s\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- "b 4f\n" // Jump to end of function
-
- // ----------------------------------------------------------------------
- "3:"
- // 'B' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fneg BXTx13.4s, x_33.4s\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "fsub BXTx43.4s, x_23.4s, x_43.4s\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- // ----------------------------------------------------------------------
- "4:"
- // End of function
-
- // Clear names
- ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
- ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
- ".unreq x_43\n" ".unreq qx_43\n" ".unreq x_44\n" ".unreq qx_44\n"
- ".unreq AXTx11\n" ".unreq BXTx13\n"
- ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
- ".unreq AXTx21\n" ".unreq BXTx23\n"
- ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
- ".unreq AXTx31\n" ".unreq BXTx33\n"
- ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
- ".unreq AXTx41\n" ".unreq BXTx43\n"
- ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
- ".unreq AXTx13\n" ".unreq BXTx11\n"
- ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
- ".unreq AXTx23\n" ".unreq BXTx21\n"
- ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
- ".unreq AXTx33\n" ".unreq BXTx31\n"
- ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
- ".unreq AXTx43\n" ".unreq BXTx41\n"
- ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
- ".unreq U\n" ".unreq qU\n"
- : [inptr1] "+r" (inptr0), // Offset to account for padded row
- [inptr2] "+r" (inptr1), // Offset to account for padded row
- [inptr3] "+r" (inptr2), // Offset to account for padded row
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [tile_j] "+r" (tile_j) // Tile counter
- : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
- [colstride2] "r" (2 * input_col_stride * sizeof(float)),
- [colstride3] "r" (3 * input_col_stride * sizeof(float)),
- [mstride1] "r" (1 * matrix_stride * sizeof(float)),
- [mstride2] "r" (2 * matrix_stride * sizeof(float)),
- [mstride3] "r" (3 * matrix_stride * sizeof(float)),
- [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24"
- );
-}
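The <1, 1, 0, 1, 4> specialisation above also treats the top row of the tile as zero padding, so row 1 of X.T x collapses from x_1j - x_3j to -x_3j; this is why only three input rows are loaded and fneg replaces fsub for that row. A sketch of this first stage, under the same assumptions and with hypothetical names:

// Illustrative only: first stage (X.T x) of the top-padded variant. Rows 2-4
// of the data tile are passed in; row 1 is implicitly zero.
static inline void winograd_2x2_3x3_xtx_top_padded_ref(const float x234[3][4], float XTx[4][4])
{
    for (int j = 0; j < 4; j++)
    {
        XTx[0][j] = -x234[1][j];             // x_1j - x_3j with x_1j == 0
        XTx[1][j] = x234[0][j] + x234[1][j]; // x_2j + x_3j
        XTx[2][j] = x234[1][j] - x234[0][j]; // x_3j - x_2j
        XTx[3][j] = x234[0][j] - x234[2][j]; // x_2j - x_4j
    }
}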
-
-// Pad left, right and bottom by 1.
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInput<float>::process_tile_row<0, 1, 1, 1, 4>(
- const int tile_N,
- const float* const input,
- const int input_row_stride,
- const int input_col_stride,
- float* const matrix,
- const int matrix_stride,
- const int matrix_row_stride
-) {
- const float *inptr0 = input;
- const float *inptr1 = input + input_row_stride;
- const float *inptr2 = input + input_row_stride * 2;
-
- float *outptr0 = matrix;
- float *outptr4 = matrix + matrix_stride * 4;
- float *outptr8 = matrix + matrix_stride * 8;
- float *outptr12 = matrix + matrix_stride * 12;
-
- int tile_j = tile_N; // Tiles to process
-
- asm volatile (
- // Named SIMD registers according to the policy given above
- // Registers into which to load the latter two columns of `x`
-    // NOTE: The bottom row is not required since it is padded.
- "x_13 .req v0\n qx_13 .req q0\n" "x_14 .req v4\n qx_14 .req q4\n"
- "x_23 .req v1\n qx_23 .req q1\n" "x_24 .req v5\n qx_24 .req q5\n"
- "x_33 .req v2\n qx_33 .req q2\n" "x_34 .req v6\n qx_34 .req q6\n"
-
- // Registers for storing X.T x (both A and B halves)
- "AXTx11 .req v8\n" "BXTx13 .req v8\n"
- "AXTx12 .req v9\n" "BXTx14 .req v9\n" "qAXTx12 .req q9\n"
- "AXTx21 .req v10\n" "BXTx23 .req v10\n"
- "AXTx22 .req v11\n" "BXTx24 .req v11\n" "qAXTx22 .req q11\n"
- "AXTx31 .req v12\n" "BXTx33 .req v12\n"
- "AXTx32 .req v13\n" "BXTx34 .req v13\n" "qAXTx32 .req q13\n"
- "AXTx41 .req v14\n" "BXTx43 .req v14\n"
- "AXTx42 .req v15\n" "BXTx44 .req v15\n" "qAXTx42 .req q15\n"
- "AXTx13 .req v16\n" "BXTx11 .req v16\n"
- "AXTx14 .req v17\n" "BXTx12 .req v17\n" "qBXTx12 .req q17\n"
- "AXTx23 .req v18\n" "BXTx21 .req v18\n"
- "AXTx24 .req v19\n" "BXTx22 .req v19\n" "qBXTx22 .req q19\n"
- "AXTx33 .req v20\n" "BXTx31 .req v20\n"
- "AXTx34 .req v21\n" "BXTx32 .req v21\n" "qBXTx32 .req q21\n"
- "AXTx43 .req v22\n" "BXTx41 .req v22\n"
- "AXTx44 .req v23\n" "BXTx42 .req v23\n" "qBXTx42 .req q23\n"
-
- "U .req v24\n qU .req q24\n"
-
- // ----------------------------------------------------------------------
- // Head of loop
- // Loads a complete 4x4 tile of x, computes X.T x, computes and stores
- // `U = X.T x X`. Prepares for the 'A' half of the loop.
- // NOTE: Since the first tile has the leftmost column padded we can
- // skip 4 loads and 4 calculations for the matrix X.T x X.
-
- // Temporarily alias registers for computing the first (non-padded)
- // column of x.
- "x_12 .req v0\n qx_12 .req q0\n"
- "x_22 .req v1\n qx_22 .req q1\n"
- "x_32 .req v2\n qx_32 .req q2\n"
-
- "ldr qx_12, [%x[inptr0]]\n"
- "ldr qx_22, [%x[inptr1]]\n"
- "ldr qx_32, [%x[inptr2]]\n"
-
- "fsub BXTx12.4s, x_12.4s, x_32.4s\n"
- "fadd BXTx22.4s, x_22.4s, x_32.4s\n"
- "fsub BXTx32.4s, x_32.4s, x_22.4s\n"
- "mov BXTx42.16b, x_22.16b\n" // Probably should do better
-
- ".unreq x_12\n .unreq qx_12\n"
- ".unreq x_22\n .unreq qx_22\n"
- ".unreq x_32\n .unreq qx_32\n"
-
-    // Load and compute the latter two columns of the first tile. Advance the
-    // input pointers by three columns so that each points at the second
-    // column of the next tile, that is, at the first column which must be
-    // read for the next tile.
- "ldr qx_13, [%x[inptr0], %x[colstride1]]\n"
- "ldr qx_23, [%x[inptr1], %x[colstride1]]\n"
- "ldr qx_33, [%x[inptr2], %x[colstride1]]\n"
-
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride2]]\n"
-
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride2]]\n"
-
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride2]]\n"
-
- "mov BXTx43.16b, x_23.16b\n"
- "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride3]\n"
-
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride3]\n"
-
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride3]\n"
-
- "mov BXTx44.16b, x_24.16b\n"
-
- // Compute and store U for the first tile
- // First row
- "fneg U.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fneg U.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fneg U.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row, simultaneously load the first column of inputs for the
- // next tile.
- "fneg U.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- // Update the loop counter, subtract two to account for both the head and
- // the tail.
- "subs %x[tile_j], %x[tile_j], #2\n"
- "beq 2f\n" // Jump to "A" tail if out of tiles
-
- // ----------------------------------------------------------------------
- "1:"
- // Start part A
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "mov AXTx43.16b, x_23.16b\n"
-
- "fsub AXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
- "fadd AXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub AXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "mov AXTx44.16b, x_24.16b\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, AXTx12.4s, AXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, AXTx22.4s, AXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, AXTx32.4s, AXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, AXTx42.4s, AXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
-
- "subs %x[tile_j], %x[tile_j], #1\n"
- "beq 3f\n" // Jump to 'B' tail
-
- // Start part B
- // Load last column of this tile (the first column has already been
- // loaded) and compute latter two columns of X.T x.
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "ldr qx_14, [%x[inptr0], %x[colstride1]]\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "ldr qx_24, [%x[inptr1], %x[colstride1]]\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "ldr qx_34, [%x[inptr2], %x[colstride1]]\n"
- "mov BXTx43.16b, x_23.16b\n"
-
- "fsub BXTx14.4s, x_14.4s, x_34.4s\n"
- "add %x[inptr0], %x[inptr0], %x[colstride2]\n"
- "fadd BXTx24.4s, x_24.4s, x_34.4s\n"
- "add %x[inptr1], %x[inptr1], %x[colstride2]\n"
- "fsub BXTx34.4s, x_34.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], %x[colstride2]\n"
- "mov BXTx44.16b, x_24.16b\n"
-
- // Compute and store U.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, BXTx12.4s, BXTx14.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], %x[matrix_row_stride]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fsub U.4s, BXTx22.4s, BXTx24.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], %x[matrix_row_stride]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, BXTx32.4s, BXTx34.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], %x[matrix_row_stride]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "ldr qx_13, [%x[inptr0]]\n"
-
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "ldr qx_23, [%x[inptr1]]\n"
-
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "ldr qx_33, [%x[inptr2]]\n"
-
- "fsub U.4s, BXTx42.4s, BXTx44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
-
- "add %x[outptr12], %x[outptr12], %x[matrix_row_stride]\n"
- "subs %x[tile_j], %x[tile_j], #1\n"
- "bne 1b\n" // Continue loop, otherwise flow into 'A' tail
-
- // ----------------------------------------------------------------------
- "2:"
- // 'A' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fsub AXTx13.4s, x_13.4s, x_33.4s\n"
- "fadd AXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub AXTx33.4s, x_33.4s, x_23.4s\n"
- "mov AXTx43.16b, x_23.16b\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, AXTx11.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, AXTx12.4s, AXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, AXTx13.4s, AXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qAXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, AXTx21.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, AXTx22.4s, AXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, AXTx23.4s, AXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qAXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, AXTx31.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, AXTx32.4s, AXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, AXTx33.4s, AXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qAXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, AXTx41.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, AXTx42.4s, AXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, AXTx43.4s, AXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qAXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- "b 4f\n" // Jump to end of function
-
- // ----------------------------------------------------------------------
- "3:"
- // 'B' tail
- // Since the final column is padding and the last-but-one column has
-    // already been loaded, just compute the 3rd column of `X.T x`.
- "fsub BXTx13.4s, x_13.4s, x_33.4s\n"
- "fadd BXTx23.4s, x_23.4s, x_33.4s\n"
- "fsub BXTx33.4s, x_33.4s, x_23.4s\n"
- "mov BXTx43.16b, x_23.16b\n"
-
- // Compute and store U. Modified to account for the final column of X.T
- // x containing padding. Note, it is also unnecessary to update the
- // output pointers.
- // First row
- "fsub U.4s, BXTx11.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fadd U.4s, BXTx12.4s, BXTx13.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, BXTx13.4s, BXTx12.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "str qBXTx12, [%x[outptr0], %x[mstride3]]\n"
-
- // Second row
- "fsub U.4s, BXTx21.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, BXTx22.4s, BXTx23.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fsub U.4s, BXTx23.4s, BXTx22.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "str qBXTx22, [%x[outptr4], %x[mstride3]]\n"
-
- // Third row
- "fsub U.4s, BXTx31.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fadd U.4s, BXTx32.4s, BXTx33.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, BXTx33.4s, BXTx32.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "str qBXTx32, [%x[outptr8], %x[mstride3]]\n"
-
- // Fourth row
- "fsub U.4s, BXTx41.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fadd U.4s, BXTx42.4s, BXTx43.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, BXTx43.4s, BXTx42.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "str qBXTx42, [%x[outptr12], %x[mstride3]]\n"
-
- // ----------------------------------------------------------------------
- "4:"
- // End of function
-
- // Clear names
- ".unreq x_13\n" ".unreq qx_13\n" ".unreq x_14\n" ".unreq qx_14\n"
- ".unreq x_23\n" ".unreq qx_23\n" ".unreq x_24\n" ".unreq qx_24\n"
- ".unreq x_33\n" ".unreq qx_33\n" ".unreq x_34\n" ".unreq qx_34\n"
- ".unreq AXTx11\n" ".unreq BXTx13\n"
- ".unreq AXTx12\n" ".unreq BXTx14\n" ".unreq qAXTx12\n"
- ".unreq AXTx21\n" ".unreq BXTx23\n"
- ".unreq AXTx22\n" ".unreq BXTx24\n" ".unreq qAXTx22\n"
- ".unreq AXTx31\n" ".unreq BXTx33\n"
- ".unreq AXTx32\n" ".unreq BXTx34\n" ".unreq qAXTx32\n"
- ".unreq AXTx41\n" ".unreq BXTx43\n"
- ".unreq AXTx42\n" ".unreq BXTx44\n" ".unreq qAXTx42\n"
- ".unreq AXTx13\n" ".unreq BXTx11\n"
- ".unreq AXTx14\n" ".unreq BXTx12\n" ".unreq qBXTx12\n"
- ".unreq AXTx23\n" ".unreq BXTx21\n"
- ".unreq AXTx24\n" ".unreq BXTx22\n" ".unreq qBXTx22\n"
- ".unreq AXTx33\n" ".unreq BXTx31\n"
- ".unreq AXTx34\n" ".unreq BXTx32\n" ".unreq qBXTx32\n"
- ".unreq AXTx43\n" ".unreq BXTx41\n"
- ".unreq AXTx44\n" ".unreq BXTx42\n" ".unreq qBXTx42\n"
- ".unreq U\n" ".unreq qU\n"
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [tile_j] "+r" (tile_j) // Tile counter
- : [colstride1] "r" (1 * input_col_stride * sizeof(float)),
- [colstride2] "r" (2 * input_col_stride * sizeof(float)),
- [colstride3] "r" (3 * input_col_stride * sizeof(float)),
- [mstride1] "r" (1 * matrix_stride * sizeof(float)),
- [mstride2] "r" (2 * matrix_stride * sizeof(float)),
- [mstride3] "r" (3 * matrix_stride * sizeof(float)),
- [matrix_row_stride] "r" (matrix_row_stride * sizeof(float))
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24"
- );
-}
-}
-#endif // __aarch64__
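Likewise, the <0, 1, 1, 1, 4> specialisation in the file above treats the bottom row of the tile as zero padding, so row 4 of X.T x (x_2j - x_4j) reduces to a plain copy of row 2, which the kernel expresses with the mov instructions. A sketch of that stage, with hypothetical names:

// Illustrative only: first stage (X.T x) of the bottom-padded variant. Rows
// 1-3 of the data tile are passed in; row 4 is implicitly zero.
static inline void winograd_2x2_3x3_xtx_bottom_padded_ref(const float x123[3][4], float XTx[4][4])
{
    for (int j = 0; j < 4; j++)
    {
        XTx[0][j] = x123[0][j] - x123[2][j]; // x_1j - x_3j
        XTx[1][j] = x123[1][j] + x123[2][j]; // x_2j + x_3j
        XTx[2][j] = x123[2][j] - x123[1][j]; // x_3j - x_2j
        XTx[3][j] = x123[1][j];              // x_2j - x_4j with x_4j == 0
    }
}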
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp
deleted file mode 100644
index ad1ad55..0000000
--- a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3/a64_float_channelwise.hpp
+++ /dev/null
@@ -1,961 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include "../input_2x2_3x3.hpp"
-
-#ifdef __aarch64__
-
-namespace winograd {
-
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 0, 0, 4>(
- int &n_channels, // Number of channels in the tile
- const float* &inptr0,
- const int input_row_stride,
- const int input_col_stride,
- float* &outptr0,
- const int matrix_stride
-) {
- // We use 4 pointers to point to the starting position on each row and use
- // three offsets to extract elements from each of the other 3 columns.
- auto inptr1 = inptr0 + 1*input_row_stride;
- auto inptr2 = inptr0 + 2*input_row_stride;
- auto inptr3 = inptr0 + 3*input_row_stride;
-
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- auto outptr1 = outptr0 + matrix_stride * 4;
- auto outptr2 = outptr0 + matrix_stride * 8;
- auto outptr3 = outptr0 + matrix_stride * 12;
-
- for (; n_channels > 3; n_channels -= 4) {
- asm volatile (
- "X_11 .req v0\n" "qX_11 .req q0\n"
- "X_12 .req v1\n" "qX_12 .req q1\n"
- "X_13 .req v2\n" "qX_13 .req q2\n"
- "X_14 .req v3\n" "qX_14 .req q3\n"
- "X_21 .req v4\n" "qX_21 .req q4\n"
- "X_22 .req v5\n" "qX_22 .req q5\n"
- "X_23 .req v6\n" "qX_23 .req q6\n"
- "X_24 .req v7\n" "qX_24 .req q7\n"
- "X_31 .req v8\n" "qX_31 .req q8\n"
- "X_32 .req v9\n" "qX_32 .req q9\n"
- "X_33 .req v10\n" "qX_33 .req q10\n"
- "X_34 .req v11\n" "qX_34 .req q11\n"
- "X_41 .req v12\n" "qX_41 .req q12\n"
- "X_42 .req v13\n" "qX_42 .req q13\n"
- "X_43 .req v14\n" "qX_43 .req q14\n"
- "X_44 .req v15\n" "qX_44 .req q15\n"
- "xX_11 .req v16\n"
- "xX_12 .req v17\n"
- "xX_13 .req v18\n"
- "xX_14 .req v19\n"
- "xX_21 .req v20\n"
- "xX_22 .req v21\n"
- "xX_23 .req v22\n"
- "xX_24 .req v23\n"
- "xX_31 .req v24\n"
- "xX_32 .req v25\n"
- "xX_33 .req v26\n"
- "xX_34 .req v27\n"
- "xX_41 .req v28\n"
- "xX_42 .req v29\n"
- "xX_43 .req v30\n"
- "xX_44 .req v31\n"
- " U .req v0\n"
- "qU .req q0\n"
-
-      // Load the tile and compute the matrix xX
- "ldr qX_11, [%x[inptr0]]\n"
- "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
- "ldr qX_14, [%x[inptr0], %x[colstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qX_21, [%x[inptr1]]\n"
- "fsub xX_11.4s, x_11.4s, x_13.4s\n"
- "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
- "fadd xX_12.4s, x_12.4s, x_13.4s\n"
- "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
- "fsub xX_13.4s, x_13.4s, x_12.4s\n"
- "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
- "fsub xX_14.4s, x_12.4s, x_14.4s\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qX_31, [%x[inptr2]]\n"
- "fsub xX_21.4s, x_21.4s, x_23.4s\n"
- "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
- "fadd xX_22.4s, x_22.4s, x_23.4s\n"
- "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
- "fsub xX_23.4s, x_23.4s, x_22.4s\n"
- "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
- "fsub xX_24.4s, x_22.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- "ldr qX_41, [%x[inptr3]]\n"
- "fsub xX_31.4s, x_31.4s, x_33.4s\n"
- "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
- "fadd xX_32.4s, x_32.4s, x_33.4s\n"
- "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
- "fsub xX_33.4s, x_33.4s, x_32.4s\n"
- "ldr qX_44, [%x[inptr3], %x[colstride3]]\n"
- "fsub xX_34.4s, x_32.4s, x_34.4s\n"
- "add %x[inptr3], %x[inptr3], #0x10\n"
-
- // Complete computing xX while beginning to compute and store
- // $U = X.T x X$
-
- "fsub xX_41.4s, x_41.4s, x_43.4s\n"
-
- "fsub U.4s, xX_11.4s, xX_31.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fsub U.4s, xX_12.4s, xX_32.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, xX_13.4s, xX_33.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, xX_14.4s, xX_34.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd xX_42.4s, x_42.4s, x_43.4s\n"
-
- "fadd U.4s, xX_21.4s, xX_31.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, xX_22.4s, xX_32.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fadd U.4s, xX_23.4s, xX_33.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fadd U.4s, xX_24.4s, xX_34.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fsub xX_43.4s, x_43.4s, x_42.4s\n"
-
- "fsub U.4s, xX_31.4s, xX_21.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fsub U.4s, xX_32.4s, xX_22.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, xX_33.4s, xX_23.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, xX_34.4s, xX_24.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fsub xX_44.4s, x_42.4s, x_44.4s\n"
-
- "fsub U.4s, xX_21.4s, xX_41.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fsub U.4s, xX_22.4s, xX_42.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, xX_23.4s, xX_43.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "fsub U.4s, xX_24.4s, xX_44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- ".unreq qU\n"
- ".unreq U\n"
- ".unreq X_11\n" ".unreq qX_11\n"
- ".unreq X_12\n" ".unreq qX_12\n"
- ".unreq X_13\n" ".unreq qX_13\n"
- ".unreq X_14\n" ".unreq qX_14\n"
- ".unreq X_21\n" ".unreq qX_21\n"
- ".unreq X_22\n" ".unreq qX_22\n"
- ".unreq X_23\n" ".unreq qX_23\n"
- ".unreq X_24\n" ".unreq qX_24\n"
- ".unreq X_31\n" ".unreq qX_31\n"
- ".unreq X_32\n" ".unreq qX_32\n"
- ".unreq X_33\n" ".unreq qX_33\n"
- ".unreq X_34\n" ".unreq qX_34\n"
- ".unreq X_41\n" ".unreq qX_41\n"
- ".unreq X_42\n" ".unreq qX_42\n"
- ".unreq X_43\n" ".unreq qX_43\n"
- ".unreq X_44\n" ".unreq qX_44\n"
- ".unreq xX_11\n"
- ".unreq xX_12\n"
- ".unreq xX_13\n"
- ".unreq xX_14\n"
- ".unreq xX_21\n"
- ".unreq xX_22\n"
- ".unreq xX_23\n"
- ".unreq xX_24\n"
- ".unreq xX_31\n"
- ".unreq xX_32\n"
- ".unreq xX_33\n"
- ".unreq xX_34\n"
- ".unreq xX_41\n"
- ".unreq xX_42\n"
- ".unreq xX_43\n"
- ".unreq xX_44\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [inptr3] "+r" (inptr3),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr1),
- [outptr8] "+r" (outptr2),
- [outptr12] "+r" (outptr3)
- : [colstride1] "r" (input_col_stride * sizeof(float)),
- [colstride2] "r" (input_col_stride * sizeof(float) * 2),
- [colstride3] "r" (input_col_stride * sizeof(float) * 3),
- [mstride1] "r" (matrix_stride * sizeof(float)),
- [mstride2] "r" (matrix_stride * sizeof(float) * 2),
- [mstride3] "r" (matrix_stride * sizeof(float) * 3)
- : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
- "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
- "v30", "v31"
- );
- }
-}
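The channelwise kernels in this file iterate over the channel dimension in blocks of four (one channel per SIMD lane), advance every pointer by 0x10 bytes per iteration, and place element (i, j) of each tile's U into matrix 4*i + j at the current channel offset. A scalar sketch of the same bookkeeping, assuming element (rather than byte) strides and hypothetical names:

// Illustrative only: scalar equivalent of the channelwise tile transform,
// processing one channel per iteration instead of four per SIMD register.
// Strides are in elements here; the assembly works in bytes.
static inline void winograd_2x2_3x3_input_channelwise_ref(
    int n_channels,
    const float *inptr, int row_stride, int col_stride,
    float *outptr, int matrix_stride)
{
    for (int c = 0; c < n_channels; c++)
    {
        float xX[4][4];

        // xX = x X: combine the columns of the 4x4 tile for this channel.
        for (int i = 0; i < 4; i++)
        {
            const float *row = inptr + i * row_stride + c;
            xX[i][0] = row[0 * col_stride] - row[2 * col_stride];
            xX[i][1] = row[1 * col_stride] + row[2 * col_stride];
            xX[i][2] = row[2 * col_stride] - row[1 * col_stride];
            xX[i][3] = row[1 * col_stride] - row[3 * col_stride];
        }

        // U = X.T (x X): combine rows and write element (i, j) of the tile
        // into matrix (4 * i + j) at this channel's offset.
        for (int j = 0; j < 4; j++)
        {
            outptr[(0 * 4 + j) * matrix_stride + c] = xX[0][j] - xX[2][j];
            outptr[(1 * 4 + j) * matrix_stride + c] = xX[1][j] + xX[2][j];
            outptr[(2 * 4 + j) * matrix_stride + c] = xX[2][j] - xX[1][j];
            outptr[(3 * 4 + j) * matrix_stride + c] = xX[1][j] - xX[3][j];
        }
    }
}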
-
-// Pad top by 1
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<1, 0, 0, 0, 4>(
- int &n_channels, // Number of channels in the tile
- const float* &inptr0,
- const int input_row_stride,
- const int input_col_stride,
- float* &outptr0,
- const int matrix_stride
-) {
- // We use 4 pointers to point to the starting position on each row and use
- // three offsets to extract elements from each of the other 3 columns.
- auto inptr1 = inptr0 + 0*input_row_stride;
- auto inptr2 = inptr0 + 1*input_row_stride;
-
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- auto outptr1 = outptr0 + matrix_stride * 4;
- auto outptr2 = outptr0 + matrix_stride * 8;
- auto outptr3 = outptr0 + matrix_stride * 12;
-
- for (; n_channels > 3; n_channels -= 4) {
- asm volatile (
- "X_21 .req v4\n" "qX_21 .req q4\n"
- "X_22 .req v5\n" "qX_22 .req q5\n"
- "X_23 .req v6\n" "qX_23 .req q6\n"
- "X_24 .req v7\n" "qX_24 .req q7\n"
- "X_31 .req v8\n" "qX_31 .req q8\n"
- "X_32 .req v9\n" "qX_32 .req q9\n"
- "X_33 .req v10\n" "qX_33 .req q10\n"
- "X_34 .req v11\n" "qX_34 .req q11\n"
- "X_41 .req v12\n" "qX_41 .req q12\n"
- "X_42 .req v13\n" "qX_42 .req q13\n"
- "X_43 .req v14\n" "qX_43 .req q14\n"
- "X_44 .req v15\n" "qX_44 .req q15\n"
- "xX_21 .req v20\n"
- "xX_22 .req v21\n"
- "xX_23 .req v22\n"
- "xX_24 .req v23\n"
- "xX_31 .req v24\n"
- "xX_32 .req v25\n"
- "xX_33 .req v26\n"
- "xX_34 .req v27\n"
- "xX_41 .req v28\n"
- "xX_42 .req v29\n"
- "xX_43 .req v30\n"
- "xX_44 .req v31\n"
- " U .req v0\n"
- "qU .req q0\n"
-
-      // Load the tile and compute the matrix xX
- "ldr qX_21, [%x[inptr1]]\n"
- "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
- "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qX_31, [%x[inptr2]]\n"
- "fsub xX_21.4s, x_21.4s, x_23.4s\n"
- "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
- "fadd xX_22.4s, x_22.4s, x_23.4s\n"
- "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
- "fsub xX_23.4s, x_23.4s, x_22.4s\n"
- "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
- "fsub xX_24.4s, x_22.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- "ldr qX_41, [%x[inptr3]]\n"
- "fsub xX_31.4s, x_31.4s, x_33.4s\n"
- "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
- "fadd xX_32.4s, x_32.4s, x_33.4s\n"
- "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
- "fsub xX_33.4s, x_33.4s, x_32.4s\n"
- "ldr qX_44, [%x[inptr3], %x[colstride3]]\n"
- "fsub xX_34.4s, x_32.4s, x_34.4s\n"
- "add %x[inptr3], %x[inptr3], #0x10\n"
-
- // Complete computing xX while beginning to compute and store
- // $U = X.T x X$
-
- "fsub xX_41.4s, x_41.4s, x_43.4s\n"
-
- "fneg U.4s, xX_31.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fneg U.4s, xX_32.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fneg U.4s, xX_33.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fneg U.4s, xX_34.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd xX_42.4s, x_42.4s, x_43.4s\n"
-
- "fadd U.4s, xX_21.4s, xX_31.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, xX_22.4s, xX_32.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fadd U.4s, xX_23.4s, xX_33.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fadd U.4s, xX_24.4s, xX_34.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fsub xX_43.4s, x_43.4s, x_42.4s\n"
-
- "fsub U.4s, xX_31.4s, xX_21.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fsub U.4s, xX_32.4s, xX_22.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, xX_33.4s, xX_23.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, xX_34.4s, xX_24.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fsub xX_44.4s, x_42.4s, x_44.4s\n"
-
- "fsub U.4s, xX_21.4s, xX_41.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fsub U.4s, xX_22.4s, xX_42.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, xX_23.4s, xX_43.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "fsub U.4s, xX_24.4s, xX_44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- ".unreq qU\n"
- ".unreq U\n"
- ".unreq X_21\n" ".unreq qX_21\n"
- ".unreq X_22\n" ".unreq qX_22\n"
- ".unreq X_23\n" ".unreq qX_23\n"
- ".unreq X_24\n" ".unreq qX_24\n"
- ".unreq X_31\n" ".unreq qX_31\n"
- ".unreq X_32\n" ".unreq qX_32\n"
- ".unreq X_33\n" ".unreq qX_33\n"
- ".unreq X_34\n" ".unreq qX_34\n"
- ".unreq X_41\n" ".unreq qX_41\n"
- ".unreq X_42\n" ".unreq qX_42\n"
- ".unreq X_43\n" ".unreq qX_43\n"
- ".unreq X_44\n" ".unreq qX_44\n"
- ".unreq xX_21\n"
- ".unreq xX_22\n"
- ".unreq xX_23\n"
- ".unreq xX_24\n"
- ".unreq xX_31\n"
- ".unreq xX_32\n"
- ".unreq xX_33\n"
- ".unreq xX_34\n"
- ".unreq xX_41\n"
- ".unreq xX_42\n"
- ".unreq xX_43\n"
- ".unreq xX_44\n"
-
- : [inptr1] "+r" (inptr0), // Offset for missing row
- [inptr2] "+r" (inptr1), // Offset for missing row
- [inptr3] "+r" (inptr2), // Offset for missing row
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr1),
- [outptr8] "+r" (outptr2),
- [outptr12] "+r" (outptr3)
- : [colstride1] "r" (input_col_stride * sizeof(float)),
- [colstride2] "r" (input_col_stride * sizeof(float) * 2),
- [colstride3] "r" (input_col_stride * sizeof(float) * 3),
- [mstride1] "r" (matrix_stride * sizeof(float)),
- [mstride2] "r" (matrix_stride * sizeof(float) * 2),
- [mstride3] "r" (matrix_stride * sizeof(float) * 3)
- : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
- "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
- "v30", "v31"
- );
- }
-}
-
-// Pad left by 1
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 1, 0, 0, 4>(
- int &n_channels, // Number of channels in the tile
- const float* &inptr0,
- const int input_row_stride,
- const int input_col_stride,
- float* &outptr0,
- const int matrix_stride
-) {
- // We use 4 pointers to point to the starting position on each row and use
- // three offsets to extract elements from each of the other 3 columns.
- auto inptr1 = inptr0 + 1*input_row_stride;
- auto inptr2 = inptr0 + 2*input_row_stride;
- auto inptr3 = inptr0 + 3*input_row_stride;
-
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- auto outptr1 = outptr0 + matrix_stride * 4;
- auto outptr2 = outptr0 + matrix_stride * 8;
- auto outptr3 = outptr0 + matrix_stride * 12;
-
- for (; n_channels > 3; n_channels -= 4) {
- asm volatile (
- "X_12 .req v1\n" "qX_12 .req q1\n"
- "X_13 .req v2\n" "qX_13 .req q2\n"
- "X_14 .req v3\n" "qX_14 .req q3\n"
- "X_22 .req v5\n" "qX_22 .req q5\n"
- "X_23 .req v6\n" "qX_23 .req q6\n"
- "X_24 .req v7\n" "qX_24 .req q7\n"
- "X_32 .req v9\n" "qX_32 .req q9\n"
- "X_33 .req v10\n" "qX_33 .req q10\n"
- "X_34 .req v11\n" "qX_34 .req q11\n"
- "X_42 .req v13\n" "qX_42 .req q13\n"
- "X_43 .req v14\n" "qX_43 .req q14\n"
- "X_44 .req v15\n" "qX_44 .req q15\n"
- "xX_11 .req v16\n"
- "xX_12 .req v17\n"
- "xX_13 .req v18\n"
- "xX_14 .req v19\n"
- "xX_21 .req v20\n"
- "xX_22 .req v21\n"
- "xX_23 .req v22\n"
- "xX_24 .req v23\n"
- "xX_31 .req v24\n"
- "xX_32 .req v25\n"
- "xX_33 .req v26\n"
- "xX_34 .req v27\n"
- "xX_41 .req v28\n"
- "xX_42 .req v29\n"
- "xX_43 .req v30\n"
- "xX_44 .req v31\n"
- " U .req v0\n"
- "qU .req q0\n"
-
-            // Load the tile, and compute the matrix xX
- "ldr qX_12, [%x[inptr0]]\n"
- "ldr qX_13, [%x[inptr0], %x[colstride1]]\n"
- "ldr qX_14, [%x[inptr0], %x[colstride2]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "fneg xX_11.4s, x_13.4s\n"
- "ldr qX_22, [%x[inptr1]]\n"
- "fadd xX_12.4s, x_12.4s, x_13.4s\n"
- "ldr qX_23, [%x[inptr1], %x[colstride1]]\n"
- "fsub xX_13.4s, x_13.4s, x_12.4s\n"
- "ldr qX_24, [%x[inptr1], %x[colstride2]]\n"
- "fsub xX_14.4s, x_12.4s, x_14.4s\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "fneg xX_21.4s, x_23.4s\n"
- "ldr qX_32, [%x[inptr2]]\n"
- "fadd xX_22.4s, x_22.4s, x_23.4s\n"
- "ldr qX_33, [%x[inptr2], %x[colstride1]]\n"
- "fsub xX_23.4s, x_23.4s, x_22.4s\n"
- "ldr qX_34, [%x[inptr2], %x[colstride2]]\n"
- "fsub xX_24.4s, x_22.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- "fneg xX_31.4s, x_33.4s\n"
- "ldr qX_42, [%x[inptr3]]\n"
- "fadd xX_32.4s, x_32.4s, x_33.4s\n"
- "ldr qX_43, [%x[inptr3], %x[colstride1]]\n"
- "fsub xX_33.4s, x_33.4s, x_32.4s\n"
- "ldr qX_44, [%x[inptr3], %x[colstride2]]\n"
- "fsub xX_34.4s, x_32.4s, x_34.4s\n"
- "add %x[inptr3], %x[inptr3], #0x10\n"
-
- // Complete computing xX while beginning to compute and store
- // $U = X.T x X$
-
- "fneg xX_41.4s, x_43.4s\n"
-
- "fsub U.4s, xX_11.4s, xX_31.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fsub U.4s, xX_12.4s, xX_32.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, xX_13.4s, xX_33.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, xX_14.4s, xX_34.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd xX_42.4s, x_42.4s, x_43.4s\n"
-
- "fadd U.4s, xX_21.4s, xX_31.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, xX_22.4s, xX_32.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fadd U.4s, xX_23.4s, xX_33.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fadd U.4s, xX_24.4s, xX_34.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fsub xX_43.4s, x_43.4s, x_42.4s\n"
-
- "fsub U.4s, xX_31.4s, xX_21.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fsub U.4s, xX_32.4s, xX_22.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, xX_33.4s, xX_23.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, xX_34.4s, xX_24.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fsub xX_44.4s, x_42.4s, x_44.4s\n"
-
- "fsub U.4s, xX_21.4s, xX_41.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fsub U.4s, xX_22.4s, xX_42.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, xX_23.4s, xX_43.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "fsub U.4s, xX_24.4s, xX_44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- ".unreq X_12\n" ".unreq qX_12\n"
- ".unreq X_13\n" ".unreq qX_13\n"
- ".unreq X_14\n" ".unreq qX_14\n"
- ".unreq X_22\n" ".unreq qX_22\n"
- ".unreq X_23\n" ".unreq qX_23\n"
- ".unreq X_24\n" ".unreq qX_24\n"
- ".unreq X_32\n" ".unreq qX_32\n"
- ".unreq X_33\n" ".unreq qX_33\n"
- ".unreq X_34\n" ".unreq qX_34\n"
- ".unreq X_42\n" ".unreq qX_42\n"
- ".unreq X_43\n" ".unreq qX_43\n"
- ".unreq X_44\n" ".unreq qX_44\n"
- ".unreq xX_11\n"
- ".unreq xX_12\n"
- ".unreq xX_13\n"
- ".unreq xX_14\n"
- ".unreq xX_21\n"
- ".unreq xX_22\n"
- ".unreq xX_23\n"
- ".unreq xX_24\n"
- ".unreq xX_31\n"
- ".unreq xX_32\n"
- ".unreq xX_33\n"
- ".unreq xX_34\n"
- ".unreq xX_41\n"
- ".unreq xX_42\n"
- ".unreq xX_43\n"
- ".unreq xX_44\n"
- ".unreq U\n"
- ".unreq qU\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [inptr3] "+r" (inptr3),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr1),
- [outptr8] "+r" (outptr2),
- [outptr12] "+r" (outptr3)
- : [colstride1] "r" (input_col_stride * sizeof(float)),
- [colstride2] "r" (input_col_stride * sizeof(float) * 2),
- [mstride1] "r" (matrix_stride * sizeof(float)),
- [mstride2] "r" (matrix_stride * sizeof(float) * 2),
- [mstride3] "r" (matrix_stride * sizeof(float) * 3)
- : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
- "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
- "v30", "v31"
- );
- }
-}
-
-// Pad bottom by 1
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 1, 0, 4>(
- int &n_channels, // Number of channels in the tile
- const float* &inptr0,
- const int input_row_stride,
- const int input_col_stride,
- float* &outptr0,
- const int matrix_stride
-) {
- // We use 4 pointers to point to the starting position on each row and use
- // three offsets to extract elements from each of the other 3 columns.
- auto inptr1 = inptr0 + 1*input_row_stride;
- auto inptr2 = inptr0 + 2*input_row_stride;
-
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- auto outptr1 = outptr0 + matrix_stride * 4;
- auto outptr2 = outptr0 + matrix_stride * 8;
- auto outptr3 = outptr0 + matrix_stride * 12;
-
- for (; n_channels > 3; n_channels -= 4) {
- asm volatile (
- "X_11 .req v0\n" "qX_11 .req q0\n"
- "X_12 .req v1\n" "qX_12 .req q1\n"
- "X_13 .req v2\n" "qX_13 .req q2\n"
- "X_14 .req v3\n" "qX_14 .req q3\n"
- "X_21 .req v4\n" "qX_21 .req q4\n"
- "X_22 .req v5\n" "qX_22 .req q5\n"
- "X_23 .req v6\n" "qX_23 .req q6\n"
- "X_24 .req v7\n" "qX_24 .req q7\n"
- "X_31 .req v8\n" "qX_31 .req q8\n"
- "X_32 .req v9\n" "qX_32 .req q9\n"
- "X_33 .req v10\n" "qX_33 .req q10\n"
- "X_34 .req v11\n" "qX_34 .req q11\n"
- "xX_11 .req v16\n"
- "xX_12 .req v17\n"
- "xX_13 .req v18\n"
- "xX_14 .req v19\n"
- "xX_21 .req v20\n" "qxX_21 .req q20\n"
- "xX_22 .req v21\n" "qxX_22 .req q21\n"
- "xX_23 .req v22\n" "qxX_23 .req q22\n"
- "xX_24 .req v23\n" "qxX_24 .req q23\n"
- "xX_31 .req v24\n"
- "xX_32 .req v25\n"
- "xX_33 .req v26\n"
- "xX_34 .req v27\n"
- " U .req v0\n"
- "qU .req q0\n"
-
-            // Load the tile, and compute the matrix xX
- "ldr qX_11, [%x[inptr0]]\n"
- "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
- "ldr qX_14, [%x[inptr0], %x[colstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qX_21, [%x[inptr1]]\n"
- "fsub xX_11.4s, x_11.4s, x_13.4s\n"
- "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
- "fadd xX_12.4s, x_12.4s, x_13.4s\n"
- "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
- "fsub xX_13.4s, x_13.4s, x_12.4s\n"
- "ldr qX_24, [%x[inptr1], %x[colstride3]]\n"
- "fsub xX_14.4s, x_12.4s, x_14.4s\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qX_31, [%x[inptr2]]\n"
- "fsub xX_21.4s, x_21.4s, x_23.4s\n"
- "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
- "fadd xX_22.4s, x_22.4s, x_23.4s\n"
- "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
- "fsub xX_23.4s, x_23.4s, x_22.4s\n"
- "ldr qX_34, [%x[inptr2], %x[colstride3]]\n"
- "fsub xX_24.4s, x_22.4s, x_24.4s\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- "fsub xX_31.4s, x_31.4s, x_33.4s\n"
- "fadd xX_32.4s, x_32.4s, x_33.4s\n"
- "fsub xX_33.4s, x_33.4s, x_32.4s\n"
- "fsub xX_34.4s, x_32.4s, x_34.4s\n"
-
- // Complete computing xX while beginning to compute and store
- // $U = X.T x X$
-
- "fsub U.4s, xX_11.4s, xX_31.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fsub U.4s, xX_12.4s, xX_32.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, xX_13.4s, xX_33.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, xX_14.4s, xX_34.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd U.4s, xX_21.4s, xX_31.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, xX_22.4s, xX_32.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fadd U.4s, xX_23.4s, xX_33.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fadd U.4s, xX_24.4s, xX_34.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fsub U.4s, xX_31.4s, xX_21.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fsub U.4s, xX_32.4s, xX_22.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, xX_33.4s, xX_23.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, xX_34.4s, xX_24.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "str qxX_21, [%x[outptr12]]\n"
- "str qxX_22, [%x[outptr12], %x[mstride1]]\n"
- "str qxX_23, [%x[outptr12], %x[mstride2]]\n"
- "str qxX_24, [%x[outptr12], %x[mstride3]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- ".unreq qU\n"
- ".unreq U\n"
- ".unreq X_11\n" ".unreq qX_11\n"
- ".unreq X_12\n" ".unreq qX_12\n"
- ".unreq X_13\n" ".unreq qX_13\n"
- ".unreq X_14\n" ".unreq qX_14\n"
- ".unreq X_21\n" ".unreq qX_21\n"
- ".unreq X_22\n" ".unreq qX_22\n"
- ".unreq X_23\n" ".unreq qX_23\n"
- ".unreq X_24\n" ".unreq qX_24\n"
- ".unreq X_31\n" ".unreq qX_31\n"
- ".unreq X_32\n" ".unreq qX_32\n"
- ".unreq X_33\n" ".unreq qX_33\n"
- ".unreq X_34\n" ".unreq qX_34\n"
- ".unreq xX_11\n"
- ".unreq xX_12\n"
- ".unreq xX_13\n"
- ".unreq xX_14\n"
- ".unreq xX_21\n" ".unreq qxX_21\n"
- ".unreq xX_22\n" ".unreq qxX_22\n"
- ".unreq xX_23\n" ".unreq qxX_23\n"
- ".unreq xX_24\n" ".unreq qxX_24\n"
- ".unreq xX_31\n"
- ".unreq xX_32\n"
- ".unreq xX_33\n"
- ".unreq xX_34\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr1),
- [outptr8] "+r" (outptr2),
- [outptr12] "+r" (outptr3)
- : [colstride1] "r" (input_col_stride * sizeof(float)),
- [colstride2] "r" (input_col_stride * sizeof(float) * 2),
- [colstride3] "r" (input_col_stride * sizeof(float) * 3),
- [mstride1] "r" (matrix_stride * sizeof(float)),
- [mstride2] "r" (matrix_stride * sizeof(float) * 2),
- [mstride3] "r" (matrix_stride * sizeof(float) * 3)
- : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
- "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
- "v30", "v31"
- );
- }
-}
-
-// Pad right by 1
-template <>
-template <>
-inline void Winograd2x2_3x3GemmInputChannelwise<float>::_process_tile<0, 0, 0, 1, 4>(
- int &n_channels, // Number of channels in the tile
- const float* &inptr0,
- const int input_row_stride,
- const int input_col_stride,
- float* &outptr0,
- const int matrix_stride
-) {
- // We use 4 pointers to point to the starting position on each row and use
- // three offsets to extract elements from each of the other 3 columns.
- auto inptr1 = inptr0 + 1*input_row_stride;
- auto inptr2 = inptr0 + 2*input_row_stride;
- auto inptr3 = inptr0 + 3*input_row_stride;
-
- // We use 4 pointers to point at matrices 0, 4, 8 and 12 and use three
- // offsets to access the intermediate matrices.
- auto outptr1 = outptr0 + matrix_stride * 4;
- auto outptr2 = outptr0 + matrix_stride * 8;
- auto outptr3 = outptr0 + matrix_stride * 12;
-
- for (; n_channels > 3; n_channels -= 4) {
- asm volatile (
- "X_11 .req v0\n" "qX_11 .req q0\n"
- "X_12 .req v1\n" "qX_12 .req q1\n"
- "X_13 .req v2\n" "qX_13 .req q2\n"
- "X_21 .req v4\n" "qX_21 .req q4\n"
- "X_22 .req v5\n" "qX_22 .req q5\n"
- "X_23 .req v6\n" "qX_23 .req q6\n"
- "X_31 .req v8\n" "qX_31 .req q8\n"
- "X_32 .req v9\n" "qX_32 .req q9\n"
- "X_33 .req v10\n" "qX_33 .req q10\n"
- "X_41 .req v12\n" "qX_41 .req q12\n"
- "X_42 .req v13\n" "qX_42 .req q13\n"
- "X_43 .req v14\n" "qX_43 .req q14\n"
- "xX_11 .req v16\n"
- "xX_12 .req v17\n"
- "xX_13 .req v18\n"
- "xX_14 .req x_12\n"
- "xX_21 .req v20\n"
- "xX_22 .req v21\n"
- "xX_23 .req v22\n"
- "xX_24 .req x_22\n"
- "xX_31 .req v24\n"
- "xX_32 .req v25\n"
- "xX_33 .req v26\n"
- "xX_34 .req x_32\n"
- "xX_41 .req v28\n"
- "xX_42 .req v29\n"
- "xX_43 .req v30\n"
- "xX_44 .req x_42\n"
- " U .req v0\n"
- "qU .req q0\n"
-
-            // Load the tile, and compute the matrix xX
- "ldr qX_11, [%x[inptr0]]\n"
- "ldr qX_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qX_13, [%x[inptr0], %x[colstride2]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qX_21, [%x[inptr1]]\n"
- "fsub xX_11.4s, x_11.4s, x_13.4s\n"
- "ldr qX_22, [%x[inptr1], %x[colstride1]]\n"
- "fadd xX_12.4s, x_12.4s, x_13.4s\n"
- "ldr qX_23, [%x[inptr1], %x[colstride2]]\n"
- "fsub xX_13.4s, x_13.4s, x_12.4s\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qX_31, [%x[inptr2]]\n"
- "fsub xX_21.4s, x_21.4s, x_23.4s\n"
- "ldr qX_32, [%x[inptr2], %x[colstride1]]\n"
- "fadd xX_22.4s, x_22.4s, x_23.4s\n"
- "ldr qX_33, [%x[inptr2], %x[colstride2]]\n"
- "fsub xX_23.4s, x_23.4s, x_22.4s\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- "ldr qX_41, [%x[inptr3]]\n"
- "fsub xX_31.4s, x_31.4s, x_33.4s\n"
- "ldr qX_42, [%x[inptr3], %x[colstride1]]\n"
- "fadd xX_32.4s, x_32.4s, x_33.4s\n"
- "ldr qX_43, [%x[inptr3], %x[colstride2]]\n"
- "fsub xX_33.4s, x_33.4s, x_32.4s\n"
- "add %x[inptr3], %x[inptr3], #0x10\n"
-
- // Complete computing xX while beginning to compute and store
- // $U = X.T x X$
-
- "fsub xX_41.4s, x_41.4s, x_43.4s\n"
-
- "fsub U.4s, xX_11.4s, xX_31.4s\n"
- "str qU, [%x[outptr0]]\n"
- "fsub U.4s, xX_12.4s, xX_32.4s\n"
- "str qU, [%x[outptr0], %x[mstride1]]\n"
- "fsub U.4s, xX_13.4s, xX_33.4s\n"
- "str qU, [%x[outptr0], %x[mstride2]]\n"
- "fsub U.4s, xX_14.4s, xX_34.4s\n"
- "str qU, [%x[outptr0], %x[mstride3]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd xX_42.4s, x_42.4s, x_43.4s\n"
-
- "fadd U.4s, xX_21.4s, xX_31.4s\n"
- "str qU, [%x[outptr4]]\n"
- "fadd U.4s, xX_22.4s, xX_32.4s\n"
- "str qU, [%x[outptr4], %x[mstride1]]\n"
- "fadd U.4s, xX_23.4s, xX_33.4s\n"
- "str qU, [%x[outptr4], %x[mstride2]]\n"
- "fadd U.4s, xX_24.4s, xX_34.4s\n"
- "str qU, [%x[outptr4], %x[mstride3]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fsub xX_43.4s, x_43.4s, x_42.4s\n"
-
- "fsub U.4s, xX_31.4s, xX_21.4s\n"
- "str qU, [%x[outptr8]]\n"
- "fsub U.4s, xX_32.4s, xX_22.4s\n"
- "str qU, [%x[outptr8], %x[mstride1]]\n"
- "fsub U.4s, xX_33.4s, xX_23.4s\n"
- "str qU, [%x[outptr8], %x[mstride2]]\n"
- "fsub U.4s, xX_34.4s, xX_24.4s\n"
- "str qU, [%x[outptr8], %x[mstride3]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fsub U.4s, xX_21.4s, xX_41.4s\n"
- "str qU, [%x[outptr12]]\n"
- "fsub U.4s, xX_22.4s, xX_42.4s\n"
- "str qU, [%x[outptr12], %x[mstride1]]\n"
- "fsub U.4s, xX_23.4s, xX_43.4s\n"
- "str qU, [%x[outptr12], %x[mstride2]]\n"
- "fsub U.4s, xX_24.4s, xX_44.4s\n"
- "str qU, [%x[outptr12], %x[mstride3]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- ".unreq qU\n"
- ".unreq U\n"
- ".unreq X_11\n" ".unreq qX_11\n"
- ".unreq X_12\n" ".unreq qX_12\n"
- ".unreq X_13\n" ".unreq qX_13\n"
- ".unreq X_21\n" ".unreq qX_21\n"
- ".unreq X_22\n" ".unreq qX_22\n"
- ".unreq X_23\n" ".unreq qX_23\n"
- ".unreq X_31\n" ".unreq qX_31\n"
- ".unreq X_32\n" ".unreq qX_32\n"
- ".unreq X_33\n" ".unreq qX_33\n"
- ".unreq X_41\n" ".unreq qX_41\n"
- ".unreq X_42\n" ".unreq qX_42\n"
- ".unreq X_43\n" ".unreq qX_43\n"
- ".unreq xX_11\n"
- ".unreq xX_12\n"
- ".unreq xX_13\n"
- ".unreq xX_14\n"
- ".unreq xX_21\n"
- ".unreq xX_22\n"
- ".unreq xX_23\n"
- ".unreq xX_24\n"
- ".unreq xX_31\n"
- ".unreq xX_32\n"
- ".unreq xX_33\n"
- ".unreq xX_34\n"
- ".unreq xX_41\n"
- ".unreq xX_42\n"
- ".unreq xX_43\n"
- ".unreq xX_44\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [inptr3] "+r" (inptr3),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr1),
- [outptr8] "+r" (outptr2),
- [outptr12] "+r" (outptr3)
- : [colstride1] "r" (input_col_stride * sizeof(float)),
- [colstride2] "r" (input_col_stride * sizeof(float) * 2),
- [mstride1] "r" (matrix_stride * sizeof(float)),
- [mstride2] "r" (matrix_stride * sizeof(float) * 2),
- [mstride3] "r" (matrix_stride * sizeof(float) * 3)
- : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
- "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
- "v30", "v31"
- );
- }
-}
-}
-#endif
diff --git a/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp
new file mode 100644
index 0000000..381ae92
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/input_2x2_3x3_fp32.cpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "transforms/input.hpp"
+#include "winograd_gemm.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+using Transform = WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+
+/******************************************************************************
+ * Cost methods for the input transform.
+ * =====================================
+ */
+template <>
+template <>
+int Transform::ops_performed(const Tensor4DShape &input_shape)
+{
+ // NOTE: Cost in FLOPs rather than instructions or uops.
+ const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
+ const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
+ return 16 * 16 * tile_M * tile_N * input_shape.n_channels;
+}
+/*****************************************************************************/
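As a quick worked example of the cost formula above (the 56x56x64 input shape is
hypothetical and chosen purely for illustration, not taken from this patch):
taking inner_tile_rows = inner_tile_cols = 4 for the 4x4 input tile described in
the comment below,

  \[ \mathrm{tile}_M = \mathrm{tile}_N = \lceil 56/4 \rceil = 14, \qquad
     16 \cdot 16 \cdot 14 \cdot 14 \cdot 64 = 3\,211\,264 \approx 3.2\ \mathrm{MFLOP}. \]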
+
+/*****************************************************************************
+* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a
+* variety of padding types. For example, tiles at the top and left of an image
+* can require one row or column of padding on their top and left sides if the
+* padding type is SAME (where X represents a padded value):
+*
+* _______ _______
+* |X X X X| |X X X X|
+* |X | | | . . .
+* |X | | |
+* |X______| |_______|
+* _______
+* |X | .
+* |X | . . . .
+* |X | .
+* |X______|
+*
+* For tiles near the right or bottom of the image it is more complicated. Such
+* tiles might require padding by 0 or 1 rows or columns if the padding type is
+* VALID or 1 or 2 rows or columns if the padding type is SAME:
+*
+* _______ _______ _______ _______
+* |X X X X| |X X X X| |X X X X| |X X X X|
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X______| |_______| |______X| |____X_X|
+* _______ _______ _______ _______
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X______| |_______| |______X| |____X_X|
+* _______ _______ _______ _______
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
+* _______ _______ _______ _______
+* |X | | | | X| | X X|
+* |X | | | | X| | X X|
+* |X X X X| |X X X X| |X X X X| |X X X X|
+* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
+*
+* Additional tiles are required for especially small input images.
+*
+* Build an array of the specialised methods that deal with each of the
+* different padding combinations which may be required. The required padding
+* combinations span the space:
+*
+* Padding top in {0, 1}
+* Padding left in {0, 1}
+* Padding bottom in {0, 1, 2}
+* Padding right in {0, 1, 2}
+*/
+template <>
+template <>
+template <int pad_top, int pad_left, int pad_bottom, int pad_right>
+void Transform::process_tile(
+ int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* const matrix_base,
+ const int matrix_stride
+)
+{
+ constexpr int inner_tile_i = 4, inner_tile_j = 4;
+ constexpr int cells_i = inner_tile_i - pad_bottom;
+  constexpr int cells_j = inner_tile_j - pad_right;
+
+ float *outptr = matrix_base;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[inner_tile_i][inner_tile_j];
+ for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[inner_tile_i][inner_tile_j];
+ float XTx[inner_tile_i][inner_tile_j];
+ float U[inner_tile_i][inner_tile_j];
+
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel.
+ float32x4_t x[inner_tile_i][inner_tile_j];
+ float32x4_t XTx[inner_tile_i][inner_tile_j];
+ float32x4_t U[inner_tile_i][inner_tile_j];
+
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++)
+ {
+ x[i][j] = vdupq_n_f32(0.0f);
+ XTx[i][j] = vdupq_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1q_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsubq_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vaddq_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsubq_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsubq_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsubq_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vaddq_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsubq_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsubq_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel.
+ float32x2_t x[inner_tile_i][inner_tile_j];
+ float32x2_t XTx[inner_tile_i][inner_tile_j];
+ float32x2_t U[inner_tile_i][inner_tile_j];
+
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+ // Load x
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = x[0][j] - x[2][j];
+ XTx[0][j] = vsub_f32(x[0][j], x[2][j]);
+
+ // XTx[1][j] = x[1][j] + x[2][j];
+ XTx[1][j] = vadd_f32(x[1][j], x[2][j]);
+
+ // XTx[2][j] = x[2][j] - x[1][j];
+ XTx[2][j] = vsub_f32(x[2][j], x[1][j]);
+
+ // XTx[3][j] = x[1][j] - x[3][j];
+ XTx[3][j] = vsub_f32(x[1][j], x[3][j]);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ // U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][0] = vsub_f32(XTx[i][0], XTx[i][2]);
+
+ // U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][1] = vadd_f32(XTx[i][1], XTx[i][2]);
+
+ // U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][2] = vsub_f32(XTx[i][2], XTx[i][1]);
+
+ // U[i][3] = XTx[i][1] - XTx[i][3];
+ U[i][3] = vsub_f32(XTx[i][1], XTx[i][3]);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ XTx[0][j] = x[0][j] - x[2][j];
+ XTx[1][j] = x[1][j] + x[2][j];
+ XTx[2][j] = x[2][j] - x[1][j];
+ XTx[3][j] = x[1][j] - x[3][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ U[i][0] = XTx[i][0] - XTx[i][2];
+ U[i][1] = XTx[i][1] + XTx[i][2];
+ U[i][2] = XTx[i][2] - XTx[i][1];
+ U[i][3] = XTx[i][1] - XTx[i][3];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
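Written in matrix form (a restatement of the scalar equations above, not extra
code in this patch), the transform computes U = B^T x B, written X^T . x . X in
the comments, where for F(2x2, 3x3)

  \[ B^{\mathsf{T}} = \begin{bmatrix}
       1 &  0 & -1 &  0 \\
       0 &  1 &  1 &  0 \\
       0 & -1 &  1 &  0 \\
       0 &  1 &  0 & -1
     \end{bmatrix}. \]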
+
+template <>
+template <>
+const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
+{
+ {
+ {
+ {
+ Transform::template process_tile<0, 0, 0, 0>, // No padding
+ Transform::template process_tile<0, 0, 0, 1>, // Right
+ Transform::template process_tile<0, 0, 0, 2>, // Right
+ },
+ {
+ Transform::template process_tile<0, 0, 1, 0>, // Bottom
+ Transform::template process_tile<0, 0, 1, 1>, // Bottom-right
+ Transform::template process_tile<0, 0, 1, 2>, // Bottom-right
+ },
+ {
+ Transform::template process_tile<0, 0, 2, 0>, // Bottom
+ Transform::template process_tile<0, 0, 2, 1>, // Bottom-right
+ Transform::template process_tile<0, 0, 2, 2>, // Bottom-right
+ }
+ },
+ {
+ {
+ Transform::template process_tile<0, 1, 0, 0>, // Left
+ Transform::template process_tile<0, 1, 0, 1>, // Left AND right
+ Transform::template process_tile<0, 1, 0, 2>, // Left AND right
+ },
+ {
+ Transform::template process_tile<0, 1, 1, 0>, // Left-bottom
+ Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right
+ Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right
+ },
+ {
+ Transform::template process_tile<0, 1, 2, 0>, // Left-bottom
+ Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right
+ Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right
+ }
+ },
+ },
+ {
+ {
+ {
+ Transform::template process_tile<1, 0, 0, 0>, // Top
+ Transform::template process_tile<1, 0, 0, 1>, // Top-right
+ Transform::template process_tile<1, 0, 0, 2>, // Top-right
+ },
+ {
+ Transform::template process_tile<1, 0, 1, 0>, // Top AND bottom
+ Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right
+ Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right
+ },
+ {
+ Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom
+ Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right
+ Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right
+ }
+ },
+ {
+ {
+ Transform::template process_tile<1, 1, 0, 0>, // Top-left
+ Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right
+ Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right
+ },
+ {
+ Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom
+ Transform::template process_tile<1, 1, 1, 1>, // All padded
+ Transform::template process_tile<1, 1, 1, 2>, // All padded
+ },
+ {
+ Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom
+ Transform::template process_tile<1, 1, 2, 1>, // All padded
+ Transform::template process_tile<1, 1, 2, 2>, // All padded
+ }
+ }
+ }
+};
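For illustration only: a minimal sketch of how a caller could pick an entry out
of a padding-indexed table like tile_fns above and invoke it. The wrapper name
transform_padded_tile, the standalone TileFn alias and the hard-coded bounds are
assumptions made for this sketch; they are not part of this patch.

    #include <cassert>

    // Function-pointer type matching the process_tile specialisations above.
    using TileFn = void (*)(int n_channels,
                            const float *input_base,
                            int input_row_stride,
                            int input_col_stride,
                            float *matrix_base,
                            int matrix_stride);

    // Hypothetical helper: for F(2x2, 3x3) the table is indexed as
    // [pad_top][pad_left][pad_bottom][pad_right], with pad_top, pad_left in
    // {0, 1} and pad_bottom, pad_right in {0, 1, 2}.
    inline void transform_padded_tile(
        const TileFn tile_fns[2][2][3][3],
        int pad_top, int pad_left, int pad_bottom, int pad_right,
        int n_channels, const float *inptr, int row_stride, int col_stride,
        float *outptr, int matrix_stride)
    {
      assert(0 <= pad_top && pad_top < 2 && 0 <= pad_left && pad_left < 2);
      assert(0 <= pad_bottom && pad_bottom < 3 && 0 <= pad_right && pad_right < 3);

      // Look up the specialisation for this padding combination and run it.
      tile_fns[pad_top][pad_left][pad_bottom][pad_right](
          n_channels, inptr, row_stride, col_stride, outptr, matrix_stride);
    }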
+
+template struct WinogradGEMM<2, 2, 3, 3>::InputTransform<float>;
+} // namespace winograd
diff --git a/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
new file mode 100644
index 0000000..477aaaf
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/input_4x4_3x3_fp32.cpp
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "transforms/input.hpp"
+#include "winograd_gemm.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+using Transform = WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
+
+template <>
+template <>
+int Transform::ops_performed(const Tensor4DShape &input_shape)
+{
+ // NOTE: Cost in FLOPs rather than instructions or uops.
+ const int tile_M = iceildiv(input_shape.n_rows, inner_tile_rows);
+ const int tile_N = iceildiv(input_shape.n_cols, inner_tile_cols);
+ return 12 * 24 * tile_M * tile_N * input_shape.n_channels;
+}
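For the same hypothetical 56x56x64 input used in the 2x2 example earlier, the
6x6 input tile described in the comment below gives

  \[ \lceil 56/6 \rceil = 10, \qquad
     12 \cdot 24 \cdot 10 \cdot 10 \cdot 64 = 1\,843\,200 \approx 1.8\ \mathrm{MFLOP}. \]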
+
+/* F(4x4, 3x3) implies the use of a 6x6 input tile. Such tiles can require a
+* variety of padding types. For example, tiles at the top and left of an
+* image can require one row or column of padding on their top and left sides
+* if the padding type is SAME (where X represents a padded value):
+*
+* ___________ ___________
+* |X X X X X X| |X X X X X X|
+* |X | | |
+* |X | | |
+* |X | | |
+* |X | | |
+* |X__________| |___________|
+* ___________
+* |X |
+* |X |
+* |X |
+* |X |
+* |X |
+* |X__________|
+*
+* For tiles near the right or bottom of the image it is more complicated.
+* Such tiles might require padding by 0, 1, 2 or 3 rows or columns if the
+* padding type is VALID or 1, 2, 3 or 4 rows or columns if the padding
+* type is SAME.
+*
+* Build an array of the specialised methods that deal with each of the
+* different padding combinations which may be required. The required padding
+* combinations span the space:
+*
+* Padding top in {0, 1}
+* Padding left in {0, 1}
+* Padding bottom in {0, 1, 2, 3, 4}
+* Padding right in {0, 1, 2, 3, 4}
+*/
+template <>
+template <>
+template <int pad_top, int pad_left, int pad_bottom, int pad_right>
+void Transform::process_tile(
+ int n_channels,
+ const float* const input_base,
+ const int input_row_stride,
+ const int input_col_stride,
+ float* const matrix_base,
+ const int matrix_stride
+)
+{
+ constexpr int cells_i = 6 - pad_bottom;
+ constexpr int cells_j = 6 - pad_right;
+
+ float *outptr = matrix_base;
+
+ // Get pointers into the input tile
+ const float *x_ptrs[6][6];
+ for (int i = pad_top, xi = 0; i < cells_i; i++, xi++)
+ {
+ // Get a pointer into the row
+ const float* const row_ptr = input_base + xi*input_row_stride;
+
+ for (int j = pad_left, xj = 0; j < cells_j; j++, xj++)
+ {
+ x_ptrs[i][j] = row_ptr + xj*input_col_stride;
+ }
+ }
+
+ // Matrices used/computed in this kernel.
+ float x[6][6], XTx[6][6], U[6][6];
+ for (int i = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++)
+ {
+ x[i][j] = XTx[i][j] = 0.0f;
+ }
+ }
+
+ // Perform the Winograd input transformation for each channel in the input
+ // tensor.
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used/computed in this kernel
+ float32x4_t x[6][6], XTx[6][6], U[6][6];
+ for (int i = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++)
+ {
+ x[i][j] = vdupq_n_f32(0.0f);
+ XTx[i][j] = vdupq_n_f32(0.0f);
+ }
+ }
+
+    // Read a 6x6 tile of input data (still in the spatial domain)
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1q_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 4;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmlsq_n_f32(vmlaq_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmlsq_n_f32(vaddq_f32(x[3][j], x[4][j]), vaddq_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[3][j]), vsubq_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmlaq_n_f32(vsubq_f32(x[4][j], x[2][j]), vsubq_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmlsq_n_f32(vmlaq_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < 6; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmlsq_n_f32(vaddq_f32(XTx[i][3], XTx[i][4]), vaddq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][3]), vsubq_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmlaq_n_f32(vsubq_f32(XTx[i][4], XTx[i][2]), vsubq_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmlsq_n_f32(vmlaq_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used/computed in this kernel
+ float32x2_t x[6][6], XTx[6][6], U[6][6];
+ for (int i = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++)
+ {
+ x[i][j] = vdup_n_f32(0.0f);
+ XTx[i][j] = vdup_n_f32(0.0f);
+ }
+ }
+
+    // Read a 6x6 tile of input data (still in the spatial domain)
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = vld1_f32(x_ptrs[i][j]);
+ x_ptrs[i][j] += 2;
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ // XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[0][j] = vmls_n_f32(vmla_n_f32(x[4][j], x[0][j], 4.0f), x[2][j], 5.0f);
+
+ // XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[1][j] = vmls_n_f32(vadd_f32(x[3][j], x[4][j]), vadd_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = vmla_n_f32(vsub_f32(x[4][j], x[3][j]), vsub_f32(x[1][j], x[2][j]), 4.0f);
+
+ // XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[3][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[3][j], x[1][j]), 2.0f);
+
+ // XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = vmla_n_f32(vsub_f32(x[4][j], x[2][j]), vsub_f32(x[1][j], x[3][j]), 2.0f);
+
+ // XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ XTx[5][j] = vmls_n_f32(vmla_n_f32(x[5][j], x[1][j], 4.0f), x[3][j], 5.0f);
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < 6; i++)
+ {
+ // U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][0] = vmls_n_f32(vmla_n_f32(XTx[i][4], XTx[i][0], 4.0f), XTx[i][2], 5.0f);
+
+ // U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][1] = vmls_n_f32(vadd_f32(XTx[i][3], XTx[i][4]), vadd_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][3]), vsub_f32(XTx[i][1], XTx[i][2]), 4.0f);
+
+ // U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][3], XTx[i][1]), 2.0f);
+
+ // U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = vmla_n_f32(vsub_f32(XTx[i][4], XTx[i][2]), vsub_f32(XTx[i][1], XTx[i][3]), 2.0f);
+
+ // U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ U[i][5] = vmls_n_f32(vmla_n_f32(XTx[i][5], XTx[i][1], 4.0f), XTx[i][3], 5.0f);
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, U[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load x
+ for (int i = pad_top; i < cells_i; i++)
+ {
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ x[i][j] = *(x_ptrs[i][j]++);
+ }
+ }
+
+ // Compute XT . x
+ for (int j = pad_left; j < cells_j; j++)
+ {
+ XTx[0][j] = 4*x[0][j] + -5*x[2][j] + 1*x[4][j];
+ XTx[1][j] = -4*x[1][j] + -4*x[2][j] + 1*x[3][j] + 1*x[4][j];
+ XTx[2][j] = 4*x[1][j] + -4*x[2][j] + -1*x[3][j] + 1*x[4][j];
+ XTx[3][j] = -2*x[1][j] + -1*x[2][j] + 2*x[3][j] + 1*x[4][j];
+ XTx[4][j] = 2*x[1][j] + -1*x[2][j] + -2*x[3][j] + 1*x[4][j];
+ XTx[5][j] = 4*x[1][j] + -5*x[3][j] + 1*x[5][j];
+ }
+
+ // Compute U = XT . x . X
+ for (int i = 0; i < 6; i++)
+ {
+ U[i][0] = 4*XTx[i][0] + -5*XTx[i][2] + 1*XTx[i][4];
+ U[i][1] = -4*XTx[i][1] + -4*XTx[i][2] + 1*XTx[i][3] + 1*XTx[i][4];
+ U[i][2] = 4*XTx[i][1] + -4*XTx[i][2] + -1*XTx[i][3] + 1*XTx[i][4];
+ U[i][3] = -2*XTx[i][1] + -1*XTx[i][2] + 2*XTx[i][3] + 1*XTx[i][4];
+ U[i][4] = 2*XTx[i][1] + -1*XTx[i][2] + -2*XTx[i][3] + 1*XTx[i][4];
+ U[i][5] = 4*XTx[i][1] + -5*XTx[i][3] + 1*XTx[i][5];
+ }
+
+ // Store the transformed matrix
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = U[i][j];
+ }
+ }
+ outptr++;
+ }
+}
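Restating the scalar path above in matrix form (for reference only), the
transform is again U = B^T x B, this time with the F(4x4, 3x3) 6x6 matrix
implied by the per-line comments:

  \[ B^{\mathsf{T}} = \begin{bmatrix}
       4 &  0 & -5 &  0 & 1 & 0 \\
       0 & -4 & -4 &  1 & 1 & 0 \\
       0 &  4 & -4 & -1 & 1 & 0 \\
       0 & -2 & -1 &  2 & 1 & 0 \\
       0 &  2 & -1 & -2 & 1 & 0 \\
       0 &  4 &  0 & -5 & 0 & 1
     \end{bmatrix}. \]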
+
+/* The lookup table below maps each supported combination of top, left,
+ * bottom and right padding to the corresponding process_tile specialisation.
+ */
+template <>
+template <>
+const Transform::TileFn Transform::tile_fns[2][2][max_pad_bottom][max_pad_right] =
+{
+ {
+ {
+ {
+ Transform::template process_tile<0, 0, 0, 0>, // No padding
+ Transform::template process_tile<0, 0, 0, 1>, // Right
+ Transform::template process_tile<0, 0, 0, 2>, // " "
+ Transform::template process_tile<0, 0, 0, 3>, // " "
+ Transform::template process_tile<0, 0, 0, 4>, // " "
+ },
+ {
+ Transform::template process_tile<0, 0, 1, 0>, // Bottom
+ Transform::template process_tile<0, 0, 1, 1>, // Bottom right
+ Transform::template process_tile<0, 0, 1, 2>, // " "
+ Transform::template process_tile<0, 0, 1, 3>, // " "
+ Transform::template process_tile<0, 0, 1, 4>, // " "
+ },
+ {
+ Transform::template process_tile<0, 0, 2, 0>, // Bottom
+ Transform::template process_tile<0, 0, 2, 1>, // Bottom right
+ Transform::template process_tile<0, 0, 2, 2>, // " "
+ Transform::template process_tile<0, 0, 2, 3>, // " "
+ Transform::template process_tile<0, 0, 2, 4>, // " "
+ },
+ {
+ Transform::template process_tile<0, 0, 3, 0>, // Bottom
+ Transform::template process_tile<0, 0, 3, 1>, // Bottom right
+ Transform::template process_tile<0, 0, 3, 2>, // " "
+ Transform::template process_tile<0, 0, 3, 3>, // " "
+ Transform::template process_tile<0, 0, 3, 4>, // " "
+ },
+ {
+ Transform::template process_tile<0, 0, 4, 0>, // Bottom
+ Transform::template process_tile<0, 0, 4, 1>, // Bottom right
+ Transform::template process_tile<0, 0, 4, 2>, // " "
+ Transform::template process_tile<0, 0, 4, 3>, // " "
+ Transform::template process_tile<0, 0, 4, 4>, // " "
+ }
+ },
+ {
+ {
+ Transform::template process_tile<0, 1, 0, 0>, // Left
+ Transform::template process_tile<0, 1, 0, 1>,
+ Transform::template process_tile<0, 1, 0, 2>,
+ Transform::template process_tile<0, 1, 0, 3>,
+ Transform::template process_tile<0, 1, 0, 4>,
+ },
+ {
+ Transform::template process_tile<0, 1, 1, 0>, // Bottom left
+ Transform::template process_tile<0, 1, 1, 1>,
+ Transform::template process_tile<0, 1, 1, 2>,
+ Transform::template process_tile<0, 1, 1, 3>,
+ Transform::template process_tile<0, 1, 1, 4>,
+ },
+ {
+ Transform::template process_tile<0, 1, 2, 0>, // " "
+ Transform::template process_tile<0, 1, 2, 1>,
+ Transform::template process_tile<0, 1, 2, 2>,
+ Transform::template process_tile<0, 1, 2, 3>,
+ Transform::template process_tile<0, 1, 2, 4>,
+ },
+ {
+ Transform::template process_tile<0, 1, 3, 0>, // " "
+ Transform::template process_tile<0, 1, 3, 1>,
+ Transform::template process_tile<0, 1, 3, 2>,
+ Transform::template process_tile<0, 1, 3, 3>,
+ Transform::template process_tile<0, 1, 3, 4>,
+ },
+ {
+ Transform::template process_tile<0, 1, 4, 0>, // " "
+ Transform::template process_tile<0, 1, 4, 1>,
+ Transform::template process_tile<0, 1, 4, 2>,
+ Transform::template process_tile<0, 1, 4, 3>,
+ Transform::template process_tile<0, 1, 4, 4>,
+ }
+ }
+ },
+ {
+ {
+ {
+ Transform::template process_tile<1, 0, 0, 0>, // Top
+ Transform::template process_tile<1, 0, 0, 1>, // Top right
+ Transform::template process_tile<1, 0, 0, 2>, // " "
+ Transform::template process_tile<1, 0, 0, 3>, // " "
+ Transform::template process_tile<1, 0, 0, 4>, // " "
+ },
+ {
+ Transform::template process_tile<1, 0, 1, 0>,
+ Transform::template process_tile<1, 0, 1, 1>,
+ Transform::template process_tile<1, 0, 1, 2>,
+ Transform::template process_tile<1, 0, 1, 3>,
+ Transform::template process_tile<1, 0, 1, 4>,
+ },
+ {
+ Transform::template process_tile<1, 0, 2, 0>,
+ Transform::template process_tile<1, 0, 2, 1>,
+ Transform::template process_tile<1, 0, 2, 2>,
+ Transform::template process_tile<1, 0, 2, 3>,
+ Transform::template process_tile<1, 0, 2, 4>,
+ },
+ {
+ Transform::template process_tile<1, 0, 3, 0>,
+ Transform::template process_tile<1, 0, 3, 1>,
+ Transform::template process_tile<1, 0, 3, 2>,
+ Transform::template process_tile<1, 0, 3, 3>,
+ Transform::template process_tile<1, 0, 3, 4>,
+ },
+ {
+ Transform::template process_tile<1, 0, 4, 0>,
+ Transform::template process_tile<1, 0, 4, 1>,
+ Transform::template process_tile<1, 0, 4, 2>,
+ Transform::template process_tile<1, 0, 4, 3>,
+ Transform::template process_tile<1, 0, 4, 4>,
+ },
+ },
+ {
+ {
+ Transform::template process_tile<1, 1, 0, 0>, // Top left
+ Transform::template process_tile<1, 1, 0, 1>,
+ Transform::template process_tile<1, 1, 0, 2>,
+ Transform::template process_tile<1, 1, 0, 3>,
+ Transform::template process_tile<1, 1, 0, 4>,
+ },
+ {
+ Transform::template process_tile<1, 1, 1, 0>,
+ Transform::template process_tile<1, 1, 1, 1>,
+ Transform::template process_tile<1, 1, 1, 2>,
+ Transform::template process_tile<1, 1, 1, 3>,
+ Transform::template process_tile<1, 1, 1, 4>,
+ },
+ {
+ Transform::template process_tile<1, 1, 2, 0>,
+ Transform::template process_tile<1, 1, 2, 1>,
+ Transform::template process_tile<1, 1, 2, 2>,
+ Transform::template process_tile<1, 1, 2, 3>,
+ Transform::template process_tile<1, 1, 2, 4>,
+ },
+ {
+ Transform::template process_tile<1, 1, 3, 0>,
+ Transform::template process_tile<1, 1, 3, 1>,
+ Transform::template process_tile<1, 1, 3, 2>,
+ Transform::template process_tile<1, 1, 3, 3>,
+ Transform::template process_tile<1, 1, 3, 4>,
+ },
+ {
+ Transform::template process_tile<1, 1, 4, 0>,
+ Transform::template process_tile<1, 1, 4, 1>,
+ Transform::template process_tile<1, 1, 4, 2>,
+ Transform::template process_tile<1, 1, 4, 3>,
+ Transform::template process_tile<1, 1, 4, 4>,
+ }
+ }
+ }
+};
+
+template struct WinogradGEMM<4, 4, 3, 3>::InputTransform<float>;
+} // namespace winograd
diff --git a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp
deleted file mode 100644
index 033442a..0000000
--- a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3.hpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-namespace winograd {
- /* Transform a kernel into the Winograd domain.
- *
- * NOTE: It is assumed that the kernel is in the form [height x width x
- * input_channels x output_channel].
- */
- template <typename T>
- struct winograd2x2_3x3_gemm_kernel_transform_impl{
- static void execute(
- const KernelShape &shape,
- const T* const kernel,
- T* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride
- );
-
- protected:
- template <const int output_channel_tail>
- static void transform_kernel(
- const T* const kernel,
- const int n_input_channels,
- const int n_output_channels,
- T* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride
- );
- };
-}
-
-/*****************************************************************************/
-/* Transform a fp32 kernel into the Winograd domain.
- */
-#include "kernel_2x2_3x3/a64_float.hpp" // AArch64 specialisations
-
-namespace winograd
-{
-template <>
-inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::execute(
- const KernelShape &shape,
- const float* const kernel,
- float* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride
-) {
- // Delegate based on tail size
- const int n_input_channels = shape.n_input_channels;
- const int n_output_channels = shape.n_output_channels;
-
- switch (n_output_channels % 4) {
- case 0:
- transform_kernel<0>(
- kernel, n_input_channels, n_output_channels,
- matrix_base, matrix_stride, matrix_row_stride
- );
- break;
- case 1:
- transform_kernel<1>(
- kernel, n_input_channels, n_output_channels,
- matrix_base, matrix_stride, matrix_row_stride
- );
- break;
- case 2:
- transform_kernel<2>(
- kernel, n_input_channels, n_output_channels,
- matrix_base, matrix_stride, matrix_row_stride
- );
- break;
- case 3:
- transform_kernel<3>(
- kernel, n_input_channels, n_output_channels,
- matrix_base, matrix_stride, matrix_row_stride
- );
- break;
- default:
- ARM_COMPUTE_ERROR("Cannot happen");
- break;
- }
-}
-
-template <>
-template<const int output_channel_tail>
-inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel(
- const float* const kernel,
- const int n_input_channels,
- const int n_output_channels,
- float* const matrix_base,
- const int mstride,
- const int matrix_row_stride
-) {
- // Use one input pointer for each row of the kernel, use two additional
- // offsets to extract columns.
- const int kernel_col_stride = n_input_channels * n_output_channels;
- const int kernel_row_stride = 3 * kernel_col_stride;
- const float *inptr0 = kernel;
- const float *inptr1 = kernel + kernel_row_stride;
- const float *inptr2 = kernel + kernel_row_stride*2;
-
- // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
- // offsets to extract further matrices.
- float *outptr0 = matrix_base;
- float *outptr4 = matrix_base + mstride * 4;
- float *outptr8 = matrix_base + mstride * 8;
- float *outptr12 = matrix_base + mstride * 12;
-
- // For every input channel
- for (int in_c = 0; in_c < n_input_channels; in_c++) {
- // For every output channel
- for (int c = 0; c < n_output_channels; c++) {
- // Read in the kernel
- float w11 = inptr0[0], w12 = inptr0[kernel_col_stride], w13 = inptr0[kernel_col_stride*2];
- float w21 = inptr1[0], w22 = inptr1[kernel_col_stride], w23 = inptr1[kernel_col_stride*2];
- float w31 = inptr2[0], w32 = inptr2[kernel_col_stride], w33 = inptr2[kernel_col_stride*2];
-
- // Progress input pointers
- inptr0++;
- inptr1++;
- inptr2++;
-
- // Compute the kernel W w, note we need only compute the middle two rows
- // (2 and 3) because the first and last rows are merely copies of values
- // from the matrix w.
- float Ww11 = w11, Ww12 = w12, Ww13 = w13;
- float Ww21 = 0.5*(w11 + w21 + w31), Ww22 = 0.5*(w12 + w22 + w32), Ww23 = 0.5*(w13 + w23 + w33);
- float Ww31 = 0.5*(w11 - w21 + w31), Ww32 = 0.5*(w12 - w22 + w32), Ww33 = 0.5*(w13 - w23 + w33);
- float Ww41 = w31, Ww42 = w32, Ww43 = w33;
-
-      // Hence compute W w W.T; again note that we need to compute only the middle two
- // columns since the first and last columns are copies of the first and
- // last columns of the previous matrix.
- float WwWT11 = Ww11, WwWT12 = 0.5*(Ww11 + Ww12 + Ww13), WwWT13 = 0.5*(Ww11 - Ww12 + Ww13), WwWT14 = Ww13;
- float WwWT21 = Ww21, WwWT22 = 0.5*(Ww21 + Ww22 + Ww23), WwWT23 = 0.5*(Ww21 - Ww22 + Ww23), WwWT24 = Ww23;
- float WwWT31 = Ww31, WwWT32 = 0.5*(Ww31 + Ww32 + Ww33), WwWT33 = 0.5*(Ww31 - Ww32 + Ww33), WwWT34 = Ww33;
- float WwWT41 = Ww41, WwWT42 = 0.5*(Ww41 + Ww42 + Ww43), WwWT43 = 0.5*(Ww41 - Ww42 + Ww43), WwWT44 = Ww43;
-
- // Store the computed weights
- outptr0[0 * mstride] = WwWT11;
- outptr0[1 * mstride] = WwWT12;
- outptr0[2 * mstride] = WwWT13;
- outptr0[3 * mstride] = WwWT14;
-
- outptr4[0 * mstride] = WwWT21;
- outptr4[1 * mstride] = WwWT22;
- outptr4[2 * mstride] = WwWT23;
- outptr4[3 * mstride] = WwWT24;
-
- outptr8[0 * mstride] = WwWT31;
- outptr8[1 * mstride] = WwWT32;
- outptr8[2 * mstride] = WwWT33;
- outptr8[3 * mstride] = WwWT34;
-
- outptr12[0 * mstride] = WwWT41;
- outptr12[1 * mstride] = WwWT42;
- outptr12[2 * mstride] = WwWT43;
- outptr12[3 * mstride] = WwWT44;
-
- // Progress output pointers
- outptr0++;
- outptr4++;
- outptr8++;
- outptr12++;
- }
-
- // Progression to complete stride
- outptr0 += matrix_row_stride - n_output_channels;
- outptr4 += matrix_row_stride - n_output_channels;
- outptr8 += matrix_row_stride - n_output_channels;
- outptr12 += matrix_row_stride - n_output_channels;
- }
-}
-}
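
The scalar reference above implements the Winograd F(2x2, 3x3) kernel transform. As a minimal sketch of the underlying algebra (stated here for orientation, not taken from the patch itself), each 3x3 filter g is mapped to a 4x4 matrix of Winograd-domain weights

    U = G \, g \, G^{\mathsf{T}}, \qquad
    G = \begin{bmatrix} 1 & 0 & 0 \\ \tfrac{1}{2} & \tfrac{1}{2} & \tfrac{1}{2} \\ \tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\ 0 & 0 & 1 \end{bmatrix}.

Because the first and last rows of G copy rows of g unchanged, the code only evaluates the middle two rows of W w explicitly, and by the same symmetry only the middle two columns of W w W.T.
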
diff --git a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp
deleted file mode 100644
index 3dd62d1..0000000
--- a/src/core/NEON/kernels/winograd/transforms/kernel_2x2_3x3/a64_float.hpp
+++ /dev/null
@@ -1,822 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-namespace winograd {
-template <>
-template <>
-inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<0>(
- const float* const kernel,
- const int n_input_channels,
- const int n_output_channels,
- float* const matrix_base,
- const int mstride,
- const int matrix_row_stride
-) {
- // Use one input pointer for each row of the kernel, use two additional
- // offsets to extract columns.
- const int kernel_col_stride = n_input_channels * n_output_channels;
- const int kernel_row_stride = 3 * kernel_col_stride;
- const float *inptr0 = kernel;
- const float *inptr1 = kernel + kernel_row_stride;
- const float *inptr2 = kernel + kernel_row_stride*2;
-
- // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
- // offsets to extract further matrices.
- float *outptr0 = matrix_base;
- float *outptr4 = matrix_base + mstride * 4;
- float *outptr8 = matrix_base + mstride * 8;
- float *outptr12 = matrix_base + mstride * 12;
-
- // For every input channel
- for (int in_c = 0; in_c < n_input_channels; in_c++) {
- int n_remaining_channels = n_output_channels;
-
- asm volatile (
- // Registers into which to read the kernel
- "w_11 .req v0\n" "qw_11 .req q0\n"
- "w_12 .req v1\n" "qw_12 .req q1\n"
- "w_13 .req v2\n" "qw_13 .req q2\n"
- "w_21 .req v3\n" "qw_21 .req q3\n"
- "w_22 .req v4\n" "qw_22 .req q4\n"
- "w_23 .req v5\n" "qw_23 .req q5\n"
- "w_31 .req v6\n" "qw_31 .req q6\n"
- "w_32 .req v7\n" "qw_32 .req q7\n"
- "w_33 .req v8\n" "qw_33 .req q8\n"
-
- // Transformed matrix Ww
- "Ww11 .req w_11\n" "Ww12 .req w_12\n" "Ww13 .req w_13\n"
- "Ww21 .req v9\n" "Ww22 .req v10\n" "Ww23 .req v11\n"
- "Ww31 .req v12\n" "Ww32 .req v13\n" "Ww33 .req v14\n"
- "Ww41 .req w_31\n" "Ww42 .req w_32\n" "Ww43 .req w_33\n"
-
- // Output matrix U = WwWT
- "U11 .req Ww11\n" "U12 .req v15\n" "U13 .req v16\n" "U14 .req Ww13\n"
- "U21 .req Ww21\n" "U22 .req v17\n" "U23 .req v18\n" "U24 .req Ww23\n"
- "U31 .req Ww31\n" "U32 .req v19\n" "U33 .req v20\n" "U34 .req Ww33\n"
- "U41 .req Ww41\n" "U42 .req v21\n" "U43 .req v22\n" "U44 .req Ww43\n"
-
- // Storage view of output matrices
- "qU11 .req q0\n" "qU12 .req q15\n" "qU13 .req q16\n" "qU14 .req q2\n"
- "qU21 .req q9\n" "qU22 .req q17\n" "qU23 .req q18\n" "qU24 .req q11\n"
- "qU31 .req q12\n" "qU32 .req q19\n" "qU33 .req q20\n" "qU34 .req q14\n"
- "qU41 .req q6\n" "qU42 .req q21\n" "qU43 .req q22\n" "qU44 .req q8\n"
-
- "half .req v23\n" // {0.5, ..., 0.5}
- "dup half.4s, %w[one_half]\n"
- "scratch .req v24\n"
-
- "1:"
- // Load tile of the kernel
- "ldr qw_11, [%x[inptr0]]\n"
- "str qU11, [%x[outptr0]]\n"
- "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
- "str qU14, [%x[outptr0], %x[mstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qw_21, [%x[inptr1]]\n"
- "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qw_31, [%x[inptr2]]\n"
- "str qU41, [%x[outptr12]]\n"
- "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
- "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
- "str qU44, [%x[outptr12], %x[mstride3]]\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- // Compute 2nd and 3rd rows of Ww
- "fadd scratch.4s, w_11.4s, w_31.4s\n"
- "fmul Ww21.4s, scratch.4s, half.4s\n"
- "fmla Ww21.4s, w_21.4s, half.4s\n"
- "str qU21, [%x[outptr4]]\n"
- "fmul Ww31.4s, scratch.4s, half.4s\n"
- "fmls Ww31.4s, w_21.4s, half.4s\n"
- "str qU31, [%x[outptr8]]\n"
-
- "fadd scratch.4s, w_12.4s, w_32.4s\n"
- "fmul Ww22.4s, scratch.4s, half.4s\n"
- "fmla Ww22.4s, w_22.4s, half.4s\n"
- "fmul Ww32.4s, scratch.4s, half.4s\n"
- "fmls Ww32.4s, w_22.4s, half.4s\n"
-
- "fadd scratch.4s, w_13.4s, w_33.4s\n"
- "fmul Ww23.4s, scratch.4s, half.4s\n"
- "fmla Ww23.4s, w_23.4s, half.4s\n"
- "str qU24, [%x[outptr4], %x[mstride3]]\n"
- "fmul Ww33.4s, scratch.4s, half.4s\n"
- "fmls Ww33.4s, w_23.4s, half.4s\n"
- "str qU34, [%x[outptr8], %x[mstride3]]\n"
-
- // Compute and store U, only need to compute the 2nd and 3rd columns
- // of U and update output pointers
- "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
- "fmul U12.4s, scratch.4s, half.4s\n"
- "fmla U12.4s, Ww12.4s, half.4s\n"
- "str qU12, [%x[outptr0], %x[mstride1]]\n"
- "fmul U13.4s, scratch.4s, half.4s\n"
- "fmls U13.4s, Ww12.4s, half.4s\n"
- "str qU13, [%x[outptr0], %x[mstride2]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
- "fmul U22.4s, scratch.4s, half.4s\n"
- "fmla U22.4s, Ww22.4s, half.4s\n"
- "str qU22, [%x[outptr4], %x[mstride1]]\n"
- "fmul U23.4s, scratch.4s, half.4s\n"
- "fmls U23.4s, Ww22.4s, half.4s\n"
- "str qU23, [%x[outptr4], %x[mstride2]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
- "fmul U32.4s, scratch.4s, half.4s\n"
- "fmla U32.4s, Ww32.4s, half.4s\n"
- "str qU32, [%x[outptr8], %x[mstride1]]\n"
- "fmul U33.4s, scratch.4s, half.4s\n"
- "fmls U33.4s, Ww32.4s, half.4s\n"
- "str qU33, [%x[outptr8], %x[mstride2]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
- "fmul U42.4s, scratch.4s, half.4s\n"
- "fmla U42.4s, Ww42.4s, half.4s\n"
- "str qU42, [%x[outptr12], %x[mstride1]]\n"
- "fmul U43.4s, scratch.4s, half.4s\n"
- "fmls U43.4s, Ww42.4s, half.4s\n"
- "str qU43, [%x[outptr12], %x[mstride2]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
- "bne 1b\n"
-
- // Clear aliases
- ".unreq half\n"
- ".unreq scratch\n"
- ".unreq w_11\n" ".unreq qw_11\n"
- ".unreq w_12\n" ".unreq qw_12\n"
- ".unreq w_13\n" ".unreq qw_13\n"
- ".unreq w_21\n" ".unreq qw_21\n"
- ".unreq w_22\n" ".unreq qw_22\n"
- ".unreq w_23\n" ".unreq qw_23\n"
- ".unreq w_31\n" ".unreq qw_31\n"
- ".unreq w_32\n" ".unreq qw_32\n"
- ".unreq w_33\n" ".unreq qw_33\n"
- ".unreq Ww11\n" ".unreq Ww12\n" ".unreq Ww13\n"
- ".unreq Ww21\n" ".unreq Ww22\n" ".unreq Ww23\n"
- ".unreq Ww31\n" ".unreq Ww32\n" ".unreq Ww33\n"
- ".unreq Ww41\n" ".unreq Ww42\n" ".unreq Ww43\n"
- ".unreq U11\n" ".unreq U12\n" ".unreq U13\n" ".unreq U14\n"
- ".unreq U21\n" ".unreq U22\n" ".unreq U23\n" ".unreq U24\n"
- ".unreq U31\n" ".unreq U32\n" ".unreq U33\n" ".unreq U34\n"
- ".unreq U41\n" ".unreq U42\n" ".unreq U43\n" ".unreq U44\n"
- ".unreq qU11\n" ".unreq qU12\n" ".unreq qU13\n" ".unreq qU14\n"
- ".unreq qU21\n" ".unreq qU22\n" ".unreq qU23\n" ".unreq qU24\n"
- ".unreq qU31\n" ".unreq qU32\n" ".unreq qU33\n" ".unreq qU34\n"
- ".unreq qU41\n" ".unreq qU42\n" ".unreq qU43\n" ".unreq qU44\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [n_remaining_channels] "+r" (n_remaining_channels)
- : [mstride1] "r" (sizeof(float) * mstride),
- [mstride2] "r" (sizeof(float) * mstride * 2),
- [mstride3] "r" (sizeof(float) * mstride * 3),
- [colstride1] "r" (sizeof(float) * kernel_col_stride),
- [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
- [one_half] "r" (0.5f)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24"
- );
-
- // Progression to complete stride
- outptr0 += matrix_row_stride - n_output_channels;
- outptr4 += matrix_row_stride - n_output_channels;
- outptr8 += matrix_row_stride - n_output_channels;
- outptr12 += matrix_row_stride - n_output_channels;
- }
-}
-
-template <>
-template <>
-inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<2>(
- const float* const kernel,
- const int n_input_channels,
- const int n_output_channels,
- float* const matrix_base,
- const int mstride,
- const int matrix_row_stride
-) {
- // Use one input pointer for each row of the kernel, use two additional
- // offsets to extract columns.
- const int kernel_col_stride = n_input_channels * n_output_channels;
- const int kernel_row_stride = 3 * kernel_col_stride;
- const float *inptr0 = kernel;
- const float *inptr1 = kernel + kernel_row_stride;
- const float *inptr2 = kernel + kernel_row_stride*2;
-
- // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
- // offsets to extract further matrices.
- float *outptr0 = matrix_base;
- float *outptr4 = matrix_base + mstride * 4;
- float *outptr8 = matrix_base + mstride * 8;
- float *outptr12 = matrix_base + mstride * 12;
-
- // For every input channel
- for (int in_c = 0; in_c < n_input_channels; in_c++) {
- int n_remaining_channels = n_output_channels;
-
- asm volatile (
- // Registers into which to read the kernel
- "w_11 .req v0\n" "qw_11 .req q0\n" "dw_11 .req d0\n"
- "w_12 .req v1\n" "qw_12 .req q1\n" "dw_12 .req d1\n"
- "w_13 .req v2\n" "qw_13 .req q2\n" "dw_13 .req d2\n"
- "w_21 .req v3\n" "qw_21 .req q3\n" "dw_21 .req d3\n"
- "w_22 .req v4\n" "qw_22 .req q4\n" "dw_22 .req d4\n"
- "w_23 .req v5\n" "qw_23 .req q5\n" "dw_23 .req d5\n"
- "w_31 .req v6\n" "qw_31 .req q6\n" "dw_31 .req d6\n"
- "w_32 .req v7\n" "qw_32 .req q7\n" "dw_32 .req d7\n"
- "w_33 .req v8\n" "qw_33 .req q8\n" "dw_33 .req d8\n"
-
- // Transformed matrix Ww
- "Ww11 .req w_11\n" "Ww12 .req w_12\n" "Ww13 .req w_13\n"
- "Ww21 .req v9\n" "Ww22 .req v10\n" "Ww23 .req v11\n"
- "Ww31 .req v12\n" "Ww32 .req v13\n" "Ww33 .req v14\n"
- "Ww41 .req w_31\n" "Ww42 .req w_32\n" "Ww43 .req w_33\n"
-
- // Output matrix U = WwWT
- "U11 .req Ww11\n" "U12 .req v15\n" "U13 .req v16\n" "U14 .req Ww13\n"
- "U21 .req Ww21\n" "U22 .req v17\n" "U23 .req v18\n" "U24 .req Ww23\n"
- "U31 .req Ww31\n" "U32 .req v19\n" "U33 .req v20\n" "U34 .req Ww33\n"
- "U41 .req Ww41\n" "U42 .req v21\n" "U43 .req v22\n" "U44 .req Ww43\n"
-
- // Storage view of output matrices
- "qU11 .req q0\n" "qU12 .req q15\n" "qU13 .req q16\n" "qU14 .req q2\n"
- "qU21 .req q9\n" "qU22 .req q17\n" "qU23 .req q18\n" "qU24 .req q11\n"
- "qU31 .req q12\n" "qU32 .req q19\n" "qU33 .req q20\n" "qU34 .req q14\n"
- "qU41 .req q6\n" "qU42 .req q21\n" "qU43 .req q22\n" "qU44 .req q8\n"
-
- "dU11 .req d0\n" "dU12 .req d15\n" "dU13 .req d16\n" "dU14 .req d2\n"
- "dU21 .req d9\n" "dU22 .req d17\n" "dU23 .req d18\n" "dU24 .req d11\n"
- "dU31 .req d12\n" "dU32 .req d19\n" "dU33 .req d20\n" "dU34 .req d14\n"
- "dU41 .req d6\n" "dU42 .req d21\n" "dU43 .req d22\n" "dU44 .req d8\n"
-
- "half .req v23\n" // {0.5, ..., 0.5}
- "dup half.4s, %w[one_half]\n"
- "scratch .req v24\n"
-
- // Subtract the tail from the number of remaining channels and jump to
- // the tail if necessary.
- "subs %x[n_remaining_channels], %x[n_remaining_channels], #2\n"
- "beq 2f\n"
-
- "1:"
- // Load tile of the kernel
- "ldr qw_11, [%x[inptr0]]\n"
- "str qU11, [%x[outptr0]]\n"
- "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
- "str qU14, [%x[outptr0], %x[mstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qw_21, [%x[inptr1]]\n"
- "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qw_31, [%x[inptr2]]\n"
- "str qU41, [%x[outptr12]]\n"
- "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
- "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
- "str qU44, [%x[outptr12], %x[mstride3]]\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- // Compute 2nd and 3rd rows of Ww
- "fadd scratch.4s, w_11.4s, w_31.4s\n"
- "fmul Ww21.4s, scratch.4s, half.4s\n"
- "fmla Ww21.4s, w_21.4s, half.4s\n"
- "str qU21, [%x[outptr4]]\n"
- "fmul Ww31.4s, scratch.4s, half.4s\n"
- "fmls Ww31.4s, w_21.4s, half.4s\n"
- "str qU31, [%x[outptr8]]\n"
-
- "fadd scratch.4s, w_12.4s, w_32.4s\n"
- "fmul Ww22.4s, scratch.4s, half.4s\n"
- "fmla Ww22.4s, w_22.4s, half.4s\n"
- "fmul Ww32.4s, scratch.4s, half.4s\n"
- "fmls Ww32.4s, w_22.4s, half.4s\n"
-
- "fadd scratch.4s, w_13.4s, w_33.4s\n"
- "fmul Ww23.4s, scratch.4s, half.4s\n"
- "fmla Ww23.4s, w_23.4s, half.4s\n"
- "str qU24, [%x[outptr4], %x[mstride3]]\n"
- "fmul Ww33.4s, scratch.4s, half.4s\n"
- "fmls Ww33.4s, w_23.4s, half.4s\n"
- "str qU34, [%x[outptr8], %x[mstride3]]\n"
-
- // Compute and store U, only need to compute the 2nd and 3rd columns
- // of U and update output pointers
- "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
- "fmul U12.4s, scratch.4s, half.4s\n"
- "fmla U12.4s, Ww12.4s, half.4s\n"
- "str qU12, [%x[outptr0], %x[mstride1]]\n"
- "fmul U13.4s, scratch.4s, half.4s\n"
- "fmls U13.4s, Ww12.4s, half.4s\n"
- "str qU13, [%x[outptr0], %x[mstride2]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
- "fmul U22.4s, scratch.4s, half.4s\n"
- "fmla U22.4s, Ww22.4s, half.4s\n"
- "str qU22, [%x[outptr4], %x[mstride1]]\n"
- "fmul U23.4s, scratch.4s, half.4s\n"
- "fmls U23.4s, Ww22.4s, half.4s\n"
- "str qU23, [%x[outptr4], %x[mstride2]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
- "fmul U32.4s, scratch.4s, half.4s\n"
- "fmla U32.4s, Ww32.4s, half.4s\n"
- "str qU32, [%x[outptr8], %x[mstride1]]\n"
- "fmul U33.4s, scratch.4s, half.4s\n"
- "fmls U33.4s, Ww32.4s, half.4s\n"
- "str qU33, [%x[outptr8], %x[mstride2]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
- "fmul U42.4s, scratch.4s, half.4s\n"
- "fmla U42.4s, Ww42.4s, half.4s\n"
- "str qU42, [%x[outptr12], %x[mstride1]]\n"
- "fmul U43.4s, scratch.4s, half.4s\n"
- "fmls U43.4s, Ww42.4s, half.4s\n"
- "str qU43, [%x[outptr12], %x[mstride2]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
- "bne 1b\n"
-
- // Tail size 2
- "2:"
- // Load tile of the kernel
- "ldr dw_11, [%x[inptr0]]\n"
- "str dU11, [%x[outptr0]]\n"
- "ldr dw_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr dw_13, [%x[inptr0], %x[colstride2]]\n"
- "str dU14, [%x[outptr0], %x[mstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x08\n"
-
- "ldr dw_21, [%x[inptr1]]\n"
- "ldr dw_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr dw_23, [%x[inptr1], %x[colstride2]]\n"
- "add %x[inptr1], %x[inptr1], #0x08\n"
-
- "ldr dw_31, [%x[inptr2]]\n"
- "str dU41, [%x[outptr12]]\n"
- "ldr dw_32, [%x[inptr2], %x[colstride1]]\n"
- "ldr dw_33, [%x[inptr2], %x[colstride2]]\n"
- "str dU44, [%x[outptr12], %x[mstride3]]\n"
- "add %x[inptr2], %x[inptr2], #0x08\n"
-
- // Compute 2nd and 3rd rows of Ww
- "fadd scratch.2s, w_11.2s, w_31.2s\n"
- "fmul Ww21.2s, scratch.2s, half.2s\n"
- "fmla Ww21.2s, w_21.2s, half.2s\n"
- "str dU21, [%x[outptr4]]\n"
- "fmul Ww31.2s, scratch.2s, half.2s\n"
- "fmls Ww31.2s, w_21.2s, half.2s\n"
- "str dU31, [%x[outptr8]]\n"
-
- "fadd scratch.2s, w_12.2s, w_32.2s\n"
- "fmul Ww22.2s, scratch.2s, half.2s\n"
- "fmla Ww22.2s, w_22.2s, half.2s\n"
- "fmul Ww32.2s, scratch.2s, half.2s\n"
- "fmls Ww32.2s, w_22.2s, half.2s\n"
-
- "fadd scratch.2s, w_13.2s, w_33.2s\n"
- "fmul Ww23.2s, scratch.2s, half.2s\n"
- "fmla Ww23.2s, w_23.2s, half.2s\n"
- "str dU24, [%x[outptr4], %x[mstride3]]\n"
- "fmul Ww33.2s, scratch.2s, half.2s\n"
- "fmls Ww33.2s, w_23.2s, half.2s\n"
- "str dU34, [%x[outptr8], %x[mstride3]]\n"
-
- // Compute and store U, only need to compute the 2nd and 3rd columns of
- // U and update output pointers
- "fadd scratch.2s, Ww11.2s, Ww13.2s\n"
- "fmul U12.2s, scratch.2s, half.2s\n"
- "fmla U12.2s, Ww12.2s, half.2s\n"
- "str dU12, [%x[outptr0], %x[mstride1]]\n"
- "fmul U13.2s, scratch.2s, half.2s\n"
- "fmls U13.2s, Ww12.2s, half.2s\n"
- "str dU13, [%x[outptr0], %x[mstride2]]\n"
- "add %x[outptr0], %x[outptr0], #0x08\n"
-
- "fadd scratch.2s, Ww21.2s, Ww23.2s\n"
- "fmul U22.2s, scratch.2s, half.2s\n"
- "fmla U22.2s, Ww22.2s, half.2s\n"
- "str dU22, [%x[outptr4], %x[mstride1]]\n"
- "fmul U23.2s, scratch.2s, half.2s\n"
- "fmls U23.2s, Ww22.2s, half.2s\n"
- "str dU23, [%x[outptr4], %x[mstride2]]\n"
- "add %x[outptr4], %x[outptr4], #0x08\n"
-
- "fadd scratch.2s, Ww31.2s, Ww33.2s\n"
- "fmul U32.2s, scratch.2s, half.2s\n"
- "fmla U32.2s, Ww32.2s, half.2s\n"
- "str dU32, [%x[outptr8], %x[mstride1]]\n"
- "fmul U33.2s, scratch.2s, half.2s\n"
- "fmls U33.2s, Ww32.2s, half.2s\n"
- "str dU33, [%x[outptr8], %x[mstride2]]\n"
- "add %x[outptr8], %x[outptr8], #0x08\n"
-
- "fadd scratch.2s, Ww41.2s, Ww43.2s\n"
- "fmul U42.2s, scratch.2s, half.2s\n"
- "fmla U42.2s, Ww42.2s, half.2s\n"
- "str dU42, [%x[outptr12], %x[mstride1]]\n"
- "fmul U43.2s, scratch.2s, half.2s\n"
- "fmls U43.2s, Ww42.2s, half.2s\n"
- "str dU43, [%x[outptr12], %x[mstride2]]\n"
- "add %x[outptr12], %x[outptr12], #0x08\n"
-
- // Clear aliases
- ".unreq half\n"
- ".unreq scratch\n"
- ".unreq w_11\n" ".unreq qw_11\n" ".unreq dw_11\n"
- ".unreq w_12\n" ".unreq qw_12\n" ".unreq dw_12\n"
- ".unreq w_13\n" ".unreq qw_13\n" ".unreq dw_13\n"
- ".unreq w_21\n" ".unreq qw_21\n" ".unreq dw_21\n"
- ".unreq w_22\n" ".unreq qw_22\n" ".unreq dw_22\n"
- ".unreq w_23\n" ".unreq qw_23\n" ".unreq dw_23\n"
- ".unreq w_31\n" ".unreq qw_31\n" ".unreq dw_31\n"
- ".unreq w_32\n" ".unreq qw_32\n" ".unreq dw_32\n"
- ".unreq w_33\n" ".unreq qw_33\n" ".unreq dw_33\n"
- ".unreq Ww11\n" ".unreq Ww12\n" ".unreq Ww13\n"
- ".unreq Ww21\n" ".unreq Ww22\n" ".unreq Ww23\n"
- ".unreq Ww31\n" ".unreq Ww32\n" ".unreq Ww33\n"
- ".unreq Ww41\n" ".unreq Ww42\n" ".unreq Ww43\n"
- ".unreq U11\n" ".unreq U12\n" ".unreq U13\n" ".unreq U14\n"
- ".unreq U21\n" ".unreq U22\n" ".unreq U23\n" ".unreq U24\n"
- ".unreq U31\n" ".unreq U32\n" ".unreq U33\n" ".unreq U34\n"
- ".unreq U41\n" ".unreq U42\n" ".unreq U43\n" ".unreq U44\n"
- ".unreq qU11\n" ".unreq qU12\n" ".unreq qU13\n" ".unreq qU14\n"
- ".unreq qU21\n" ".unreq qU22\n" ".unreq qU23\n" ".unreq qU24\n"
- ".unreq qU31\n" ".unreq qU32\n" ".unreq qU33\n" ".unreq qU34\n"
- ".unreq qU41\n" ".unreq qU42\n" ".unreq qU43\n" ".unreq qU44\n"
- ".unreq dU11\n" ".unreq dU12\n" ".unreq dU13\n" ".unreq dU14\n"
- ".unreq dU21\n" ".unreq dU22\n" ".unreq dU23\n" ".unreq dU24\n"
- ".unreq dU31\n" ".unreq dU32\n" ".unreq dU33\n" ".unreq dU34\n"
- ".unreq dU41\n" ".unreq dU42\n" ".unreq dU43\n" ".unreq dU44\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [n_remaining_channels] "+r" (n_remaining_channels)
- : [mstride1] "r" (sizeof(float) * mstride),
- [mstride2] "r" (sizeof(float) * mstride * 2),
- [mstride3] "r" (sizeof(float) * mstride * 3),
- [colstride1] "r" (sizeof(float) * kernel_col_stride),
- [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
- [one_half] "r" (0.5f)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24"
- );
-
- // Progression to complete stride
- outptr0 += matrix_row_stride - n_output_channels;
- outptr4 += matrix_row_stride - n_output_channels;
- outptr8 += matrix_row_stride - n_output_channels;
- outptr12 += matrix_row_stride - n_output_channels;
- }
-}
-
-template <>
-template <>
-inline void winograd2x2_3x3_gemm_kernel_transform_impl<float>::transform_kernel<1>(
- const float* const kernel,
- const int n_input_channels,
- const int n_output_channels,
- float* const matrix_base,
- const int mstride,
- const int matrix_row_stride
-) {
- // Use one input pointer for each row of the kernel, use two additional
- // offsets to extract columns.
- const int kernel_col_stride = n_input_channels * n_output_channels;
- const int kernel_row_stride = 3 * kernel_col_stride;
- const float *inptr0 = kernel;
- const float *inptr1 = kernel + kernel_row_stride;
- const float *inptr2 = kernel + kernel_row_stride*2;
-
- // Use four output pointers, for output matrices 0, 4, 8 and 12. Use three
- // offsets to extract further matrices.
- float *outptr0 = matrix_base;
- float *outptr4 = matrix_base + mstride * 4;
- float *outptr8 = matrix_base + mstride * 8;
- float *outptr12 = matrix_base + mstride * 12;
-
- // For every input channel
- for (int in_c = 0; in_c < n_input_channels; in_c++) {
- int n_remaining_channels = n_output_channels;
-
- asm volatile (
- // Registers into which to read the kernel
- "w_11 .req v0\n" "qw_11 .req q0\n" "sw_11 .req s0\n"
- "w_12 .req v1\n" "qw_12 .req q1\n" "sw_12 .req s1\n"
- "w_13 .req v2\n" "qw_13 .req q2\n" "sw_13 .req s2\n"
- "w_21 .req v3\n" "qw_21 .req q3\n" "sw_21 .req s3\n"
- "w_22 .req v4\n" "qw_22 .req q4\n" "sw_22 .req s4\n"
- "w_23 .req v5\n" "qw_23 .req q5\n" "sw_23 .req s5\n"
- "w_31 .req v6\n" "qw_31 .req q6\n" "sw_31 .req s6\n"
- "w_32 .req v7\n" "qw_32 .req q7\n" "sw_32 .req s7\n"
- "w_33 .req v8\n" "qw_33 .req q8\n" "sw_33 .req s8\n"
-
- // Transformed matrix Ww
- "Ww11 .req w_11\n" "Ww12 .req w_12\n" "Ww13 .req w_13\n"
- "Ww21 .req v9\n" "Ww22 .req v10\n" "Ww23 .req v11\n"
- "Ww31 .req v12\n" "Ww32 .req v13\n" "Ww33 .req v14\n"
- "Ww41 .req w_31\n" "Ww42 .req w_32\n" "Ww43 .req w_33\n"
-
- // Output matrix U = WwWT
- "U11 .req Ww11\n" "U12 .req v15\n" "U13 .req v16\n" "U14 .req Ww13\n"
- "U21 .req Ww21\n" "U22 .req v17\n" "U23 .req v18\n" "U24 .req Ww23\n"
- "U31 .req Ww31\n" "U32 .req v19\n" "U33 .req v20\n" "U34 .req Ww33\n"
- "U41 .req Ww41\n" "U42 .req v21\n" "U43 .req v22\n" "U44 .req Ww43\n"
-
- // Storage view of output matrices
- "qU11 .req q0\n" "qU12 .req q15\n" "qU13 .req q16\n" "qU14 .req q2\n"
- "qU21 .req q9\n" "qU22 .req q17\n" "qU23 .req q18\n" "qU24 .req q11\n"
- "qU31 .req q12\n" "qU32 .req q19\n" "qU33 .req q20\n" "qU34 .req q14\n"
- "qU41 .req q6\n" "qU42 .req q21\n" "qU43 .req q22\n" "qU44 .req q8\n"
-
- "sU11 .req s0\n" "sU12 .req s15\n" "sU13 .req s16\n" "sU14 .req s2\n"
- "sU21 .req s9\n" "sU22 .req s17\n" "sU23 .req s18\n" "sU24 .req s11\n"
- "sU31 .req s12\n" "sU32 .req s19\n" "sU33 .req s20\n" "sU34 .req s14\n"
- "sU41 .req s6\n" "sU42 .req s21\n" "sU43 .req s22\n" "sU44 .req s8\n"
-
- "half .req v23\n" // {0.5, ..., 0.5}
- "dup half.4s, %w[one_half]\n"
- "scratch .req v24\n"
-
- // Subtract the tail from the number of remaining channels and jump to
- // the tail if necessary.
- "subs %x[n_remaining_channels], %x[n_remaining_channels], #1\n"
- "beq 2f\n"
-
- "1:"
- // Load tile of the kernel
- "ldr qw_11, [%x[inptr0]]\n"
- "str qU11, [%x[outptr0]]\n"
- "ldr qw_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr qw_13, [%x[inptr0], %x[colstride2]]\n"
- "str qU14, [%x[outptr0], %x[mstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "ldr qw_21, [%x[inptr1]]\n"
- "ldr qw_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr qw_23, [%x[inptr1], %x[colstride2]]\n"
- "add %x[inptr1], %x[inptr1], #0x10\n"
-
- "ldr qw_31, [%x[inptr2]]\n"
- "str qU41, [%x[outptr12]]\n"
- "ldr qw_32, [%x[inptr2], %x[colstride1]]\n"
- "ldr qw_33, [%x[inptr2], %x[colstride2]]\n"
- "str qU44, [%x[outptr12], %x[mstride3]]\n"
- "add %x[inptr2], %x[inptr2], #0x10\n"
-
- // Compute 2nd and 3rd rows of Ww
- "fadd scratch.4s, w_11.4s, w_31.4s\n"
- "fmul Ww21.4s, scratch.4s, half.4s\n"
- "fmla Ww21.4s, w_21.4s, half.4s\n"
- "str qU21, [%x[outptr4]]\n"
- "fmul Ww31.4s, scratch.4s, half.4s\n"
- "fmls Ww31.4s, w_21.4s, half.4s\n"
- "str qU31, [%x[outptr8]]\n"
-
- "fadd scratch.4s, w_12.4s, w_32.4s\n"
- "fmul Ww22.4s, scratch.4s, half.4s\n"
- "fmla Ww22.4s, w_22.4s, half.4s\n"
- "fmul Ww32.4s, scratch.4s, half.4s\n"
- "fmls Ww32.4s, w_22.4s, half.4s\n"
-
- "fadd scratch.4s, w_13.4s, w_33.4s\n"
- "fmul Ww23.4s, scratch.4s, half.4s\n"
- "fmla Ww23.4s, w_23.4s, half.4s\n"
- "str qU24, [%x[outptr4], %x[mstride3]]\n"
- "fmul Ww33.4s, scratch.4s, half.4s\n"
- "fmls Ww33.4s, w_23.4s, half.4s\n"
- "str qU34, [%x[outptr8], %x[mstride3]]\n"
-
- // Compute and store U, only need to compute the 2nd and 3rd columns
- // of U and update output pointers
- "fadd scratch.4s, Ww11.4s, Ww13.4s\n"
- "fmul U12.4s, scratch.4s, half.4s\n"
- "fmla U12.4s, Ww12.4s, half.4s\n"
- "str qU12, [%x[outptr0], %x[mstride1]]\n"
- "fmul U13.4s, scratch.4s, half.4s\n"
- "fmls U13.4s, Ww12.4s, half.4s\n"
- "str qU13, [%x[outptr0], %x[mstride2]]\n"
- "add %x[outptr0], %x[outptr0], #0x10\n"
-
- "fadd scratch.4s, Ww21.4s, Ww23.4s\n"
- "fmul U22.4s, scratch.4s, half.4s\n"
- "fmla U22.4s, Ww22.4s, half.4s\n"
- "str qU22, [%x[outptr4], %x[mstride1]]\n"
- "fmul U23.4s, scratch.4s, half.4s\n"
- "fmls U23.4s, Ww22.4s, half.4s\n"
- "str qU23, [%x[outptr4], %x[mstride2]]\n"
- "add %x[outptr4], %x[outptr4], #0x10\n"
-
- "fadd scratch.4s, Ww31.4s, Ww33.4s\n"
- "fmul U32.4s, scratch.4s, half.4s\n"
- "fmla U32.4s, Ww32.4s, half.4s\n"
- "str qU32, [%x[outptr8], %x[mstride1]]\n"
- "fmul U33.4s, scratch.4s, half.4s\n"
- "fmls U33.4s, Ww32.4s, half.4s\n"
- "str qU33, [%x[outptr8], %x[mstride2]]\n"
- "add %x[outptr8], %x[outptr8], #0x10\n"
-
- "fadd scratch.4s, Ww41.4s, Ww43.4s\n"
- "fmul U42.4s, scratch.4s, half.4s\n"
- "fmla U42.4s, Ww42.4s, half.4s\n"
- "str qU42, [%x[outptr12], %x[mstride1]]\n"
- "fmul U43.4s, scratch.4s, half.4s\n"
- "fmls U43.4s, Ww42.4s, half.4s\n"
- "str qU43, [%x[outptr12], %x[mstride2]]\n"
- "add %x[outptr12], %x[outptr12], #0x10\n"
-
- "subs %x[n_remaining_channels], %x[n_remaining_channels], #4\n"
- "bne 1b\n"
-
- // Tail size 1
- "2:"
- // Load tile of the kernel
- "ldr sw_11, [%x[inptr0]]\n"
- "str sU11, [%x[outptr0]]\n"
- "ldr sw_12, [%x[inptr0], %x[colstride1]]\n"
- "ldr sw_13, [%x[inptr0], %x[colstride2]]\n"
- "str sU14, [%x[outptr0], %x[mstride3]]\n"
- "add %x[inptr0], %x[inptr0], #0x04\n"
-
- "ldr sw_21, [%x[inptr1]]\n"
- "ldr sw_22, [%x[inptr1], %x[colstride1]]\n"
- "ldr sw_23, [%x[inptr1], %x[colstride2]]\n"
- "add %x[inptr1], %x[inptr1], #0x04\n"
-
- "ldr sw_31, [%x[inptr2]]\n"
- "str sU41, [%x[outptr12]]\n"
- "ldr sw_32, [%x[inptr2], %x[colstride1]]\n"
- "ldr sw_33, [%x[inptr2], %x[colstride2]]\n"
- "str sU44, [%x[outptr12], %x[mstride3]]\n"
- "add %x[inptr2], %x[inptr2], #0x04\n"
-
- // Compute 2nd and 3rd rows of Ww
- "fadd scratch.2s, w_11.2s, w_31.2s\n"
- "fmul Ww21.2s, scratch.2s, half.2s\n"
- "fmla Ww21.2s, w_21.2s, half.2s\n"
- "str sU21, [%x[outptr4]]\n"
- "fmul Ww31.2s, scratch.2s, half.2s\n"
- "fmls Ww31.2s, w_21.2s, half.2s\n"
- "str sU31, [%x[outptr8]]\n"
-
- "fadd scratch.2s, w_12.2s, w_32.2s\n"
- "fmul Ww22.2s, scratch.2s, half.2s\n"
- "fmla Ww22.2s, w_22.2s, half.2s\n"
- "fmul Ww32.2s, scratch.2s, half.2s\n"
- "fmls Ww32.2s, w_22.2s, half.2s\n"
-
- "fadd scratch.2s, w_13.2s, w_33.2s\n"
- "fmul Ww23.2s, scratch.2s, half.2s\n"
- "fmla Ww23.2s, w_23.2s, half.2s\n"
- "str sU24, [%x[outptr4], %x[mstride3]]\n"
- "fmul Ww33.2s, scratch.2s, half.2s\n"
- "fmls Ww33.2s, w_23.2s, half.2s\n"
- "str sU34, [%x[outptr8], %x[mstride3]]\n"
-
- // Compute and store U, only need to compute the 2nd and 3rd columns of
- // U and update output pointers
- "fadd scratch.2s, Ww11.2s, Ww13.2s\n"
- "fmul U12.2s, scratch.2s, half.2s\n"
- "fmla U12.2s, Ww12.2s, half.2s\n"
- "str sU12, [%x[outptr0], %x[mstride1]]\n"
- "fmul U13.2s, scratch.2s, half.2s\n"
- "fmls U13.2s, Ww12.2s, half.2s\n"
- "str sU13, [%x[outptr0], %x[mstride2]]\n"
- "add %x[outptr0], %x[outptr0], #0x04\n"
-
- "fadd scratch.2s, Ww21.2s, Ww23.2s\n"
- "fmul U22.2s, scratch.2s, half.2s\n"
- "fmla U22.2s, Ww22.2s, half.2s\n"
- "str sU22, [%x[outptr4], %x[mstride1]]\n"
- "fmul U23.2s, scratch.2s, half.2s\n"
- "fmls U23.2s, Ww22.2s, half.2s\n"
- "str sU23, [%x[outptr4], %x[mstride2]]\n"
- "add %x[outptr4], %x[outptr4], #0x04\n"
-
- "fadd scratch.2s, Ww31.2s, Ww33.2s\n"
- "fmul U32.2s, scratch.2s, half.2s\n"
- "fmla U32.2s, Ww32.2s, half.2s\n"
- "str sU32, [%x[outptr8], %x[mstride1]]\n"
- "fmul U33.2s, scratch.2s, half.2s\n"
- "fmls U33.2s, Ww32.2s, half.2s\n"
- "str sU33, [%x[outptr8], %x[mstride2]]\n"
- "add %x[outptr8], %x[outptr8], #0x04\n"
-
- "fadd scratch.2s, Ww41.2s, Ww43.2s\n"
- "fmul U42.2s, scratch.2s, half.2s\n"
- "fmla U42.2s, Ww42.2s, half.2s\n"
- "str sU42, [%x[outptr12], %x[mstride1]]\n"
- "fmul U43.2s, scratch.2s, half.2s\n"
- "fmls U43.2s, Ww42.2s, half.2s\n"
- "str sU43, [%x[outptr12], %x[mstride2]]\n"
- "add %x[outptr12], %x[outptr12], #0x04\n"
-
- // Clear aliases
- ".unreq half\n"
- ".unreq scratch\n"
- ".unreq w_11\n" ".unreq qw_11\n" ".unreq sw_11\n"
- ".unreq w_12\n" ".unreq qw_12\n" ".unreq sw_12\n"
- ".unreq w_13\n" ".unreq qw_13\n" ".unreq sw_13\n"
- ".unreq w_21\n" ".unreq qw_21\n" ".unreq sw_21\n"
- ".unreq w_22\n" ".unreq qw_22\n" ".unreq sw_22\n"
- ".unreq w_23\n" ".unreq qw_23\n" ".unreq sw_23\n"
- ".unreq w_31\n" ".unreq qw_31\n" ".unreq sw_31\n"
- ".unreq w_32\n" ".unreq qw_32\n" ".unreq sw_32\n"
- ".unreq w_33\n" ".unreq qw_33\n" ".unreq sw_33\n"
- ".unreq Ww11\n" ".unreq Ww12\n" ".unreq Ww13\n"
- ".unreq Ww21\n" ".unreq Ww22\n" ".unreq Ww23\n"
- ".unreq Ww31\n" ".unreq Ww32\n" ".unreq Ww33\n"
- ".unreq Ww41\n" ".unreq Ww42\n" ".unreq Ww43\n"
- ".unreq U11\n" ".unreq U12\n" ".unreq U13\n" ".unreq U14\n"
- ".unreq U21\n" ".unreq U22\n" ".unreq U23\n" ".unreq U24\n"
- ".unreq U31\n" ".unreq U32\n" ".unreq U33\n" ".unreq U34\n"
- ".unreq U41\n" ".unreq U42\n" ".unreq U43\n" ".unreq U44\n"
- ".unreq qU11\n" ".unreq qU12\n" ".unreq qU13\n" ".unreq qU14\n"
- ".unreq qU21\n" ".unreq qU22\n" ".unreq qU23\n" ".unreq qU24\n"
- ".unreq qU31\n" ".unreq qU32\n" ".unreq qU33\n" ".unreq qU34\n"
- ".unreq qU41\n" ".unreq qU42\n" ".unreq qU43\n" ".unreq qU44\n"
- ".unreq sU11\n" ".unreq sU12\n" ".unreq sU13\n" ".unreq sU14\n"
- ".unreq sU21\n" ".unreq sU22\n" ".unreq sU23\n" ".unreq sU24\n"
- ".unreq sU31\n" ".unreq sU32\n" ".unreq sU33\n" ".unreq sU34\n"
- ".unreq sU41\n" ".unreq sU42\n" ".unreq sU43\n" ".unreq sU44\n"
-
- : [inptr0] "+r" (inptr0),
- [inptr1] "+r" (inptr1),
- [inptr2] "+r" (inptr2),
- [outptr0] "+r" (outptr0),
- [outptr4] "+r" (outptr4),
- [outptr8] "+r" (outptr8),
- [outptr12] "+r" (outptr12),
- [n_remaining_channels] "+r" (n_remaining_channels)
- : [mstride1] "r" (sizeof(float) * mstride),
- [mstride2] "r" (sizeof(float) * mstride * 2),
- [mstride3] "r" (sizeof(float) * mstride * 3),
- [colstride1] "r" (sizeof(float) * kernel_col_stride),
- [colstride2] "r" (sizeof(float) * kernel_col_stride * 2),
- [one_half] "r" (0.5f)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
- "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
- "v20", "v21", "v22", "v23", "v24"
- );
-
- // Progression to complete stride
- outptr0 += matrix_row_stride - n_output_channels;
- outptr4 += matrix_row_stride - n_output_channels;
- outptr8 += matrix_row_stride - n_output_channels;
- outptr12 += matrix_row_stride - n_output_channels;
- }
-}
-}
-#endif // __aarch64__
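
The AArch64 specializations removed above perform the same per-channel arithmetic as the scalar reference, four output channels per NEON iteration, with the <1> and <2> specializations repeating the computation once more on one- or two-channel tails via s- and d-register loads. As a hedged illustration only, the plain C++ sketch below applies the transform to a single 3x3 filter; the function name and demo values are hypothetical and are not part of this patch.

#include <array>
#include <cstdio>

// Winograd F(2x2, 3x3) kernel transform for one filter: U = G g G^T.
// Illustrative sketch only; the deleted kernels interleave this arithmetic
// across input/output channels and scatter each U element into its own matrix.
static std::array<std::array<float, 4>, 4> transform_3x3_kernel(const float g[3][3])
{
    // Ww = G g: rows 0 and 3 copy rows of g, rows 1 and 2 are scaled sums/differences.
    float Ww[4][3];
    for (int j = 0; j < 3; j++)
    {
        Ww[0][j] = g[0][j];
        Ww[1][j] = 0.5f * (g[0][j] + g[1][j] + g[2][j]);
        Ww[2][j] = 0.5f * (g[0][j] - g[1][j] + g[2][j]);
        Ww[3][j] = g[2][j];
    }

    // U = Ww G^T: columns 0 and 3 copy columns of Ww, columns 1 and 2 are scaled sums/differences.
    std::array<std::array<float, 4>, 4> U{};
    for (int i = 0; i < 4; i++)
    {
        U[i][0] = Ww[i][0];
        U[i][1] = 0.5f * (Ww[i][0] + Ww[i][1] + Ww[i][2]);
        U[i][2] = 0.5f * (Ww[i][0] - Ww[i][1] + Ww[i][2]);
        U[i][3] = Ww[i][2];
    }
    return U;
}

int main()
{
    const float g[3][3] = { { 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 } };
    const auto U = transform_3x3_kernel(g);
    for (const auto &row : U)
    {
        for (float u : row) std::printf("%6.2f ", u);
        std::printf("\n");
    }
    return 0;
}

In the deleted kernels the same arithmetic is fused with the loads and stores so that each of the sixteen U elements lands in a separate output matrix, mstride floats apart, with matrix_row_stride advancing the write position between input channels.
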
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp
deleted file mode 100644
index 0992c0b..0000000
--- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-namespace winograd {
- /* Transform from the Winograd domain back to the spatial domain.
- */
- template <typename T>
- struct Winograd2x2_3x3GemmOutput {
- static void execute(
- const Tensor4DShape &output_shape,
- T* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride,
- T* const output
- );
-
- protected:
- /* Specialised implementation method. */
- template <bool tail_M, bool tail_N, int channel_tail>
- static void _execute(
- const Tensor4DShape &output_shape,
- T *output,
- const T *input,
- const int matrix_stride,
- const int matrix_row_stride
- );
- };
-
- /* Two-stage implementation of the transformation from the Winograd domain.
- *
-   * First computes F.Z and then computes Z^T.(F.Z).
- */
- template <typename T>
- struct Winograd2x2_3x3GemmOutput_TwoStage {
- static void execute(
- const Tensor4DShape &output_shape,
- T* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride,
- T* const output
- );
-
- protected:
- template <int channel_tail>
- static void compute_zf(
- const int n_rows, const int n_channels,
- T* const zf, const T* const input[16]
- );
-
- template <bool tail_M, bool tail_N, int channel_tail>
- static void compute_zfzT(
- const Tensor4DShape &output_shape,
- T* const output, const T* const zf
- );
- };
-}
-
-#include "output_2x2_3x3/a64_float.hpp"
-// #include "output_2x2_3x3/a64_float_two_stage.hpp"
-
-/*****************************************************************************/
-/*
-template <typename T>
-void winograd::Winograd2x2_3x3GemmOutput<T>::execute(
- const Tensor4DShape &output_shape,
- const int tile_M,
- const int tile_N,
- T* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride,
- T* const output
-) {
- T* const antipadding = reinterpret_cast<T *>(malloc(sizeof(T) * output_shape.n_channels));
-
- // Get input pointers
- const T* inptrs[16];
- for (int i = 0; i < 16; i++) {
- inptrs[i] = matrices[i];
- }
-
- for (int batch = 0; batch < output_shape.n_batches; batch++) {
- for (int tile_i = 0; tile_i < tile_M; tile_i++) {
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- // Get pointers for each of the 4 output cells required for this computation
- T* outptrs[4];
- for (int cell_i = 0, c = 0; cell_i < 2; cell_i++) {
- for (int cell_j = 0; cell_j < 2; cell_j++, c++) {
- const int i = tile_i*2 + cell_i;
- const int j = tile_j*2 + cell_j;
-
- if (i < output_shape.n_rows && j < output_shape.n_cols) {
- outptrs[c] = output + (
- (batch*output_shape.n_rows + i) * output_shape.n_cols +
- j) * output_shape.n_channels;
- } else {
- outptrs[c] = antipadding;
- }
- } // cell_j
- } // cell_i
-
- for (int n = 0; n < output_shape.n_channels; n++) {
- // Read 16 values and progress pointers
- T v[16];
- for (int i = 0; i < 16; i++) {
- v[i] = *(inptrs[i]++);
- }
-
- // Compute output for 4 pixels
- *(outptrs[0]++) = v[ 0] + v[ 1] + v[ 2] +
- v[ 4] + v[ 5] + v[ 6] +
- v[ 8] + v[ 9] + v[10];
- *(outptrs[1]++) = v[ 1] - v[ 2] - v[ 3] +
- v[ 5] - v[ 6] - v[ 7] +
- v[ 9] - v[10] - v[11];
- *(outptrs[2]++) = v[ 4] + v[ 5] + v[ 6] -
- v[ 8] - v[ 9] - v[10] -
- v[12] - v[13] - v[14];
- *(outptrs[3]++) = v[ 5] - v[ 6] - v[ 7] -
- v[ 9] + v[10] + v[11] -
- v[13] + v[14] + v[15];
- } // output_channel
- } // tile_j
- } // tile_i
- } // batch
-
- free(antipadding);
-}
-*/
-
-/*****************************************************************************/
-/*
-template <typename T>
-void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::execute(
- const Tensor4DShape &output_shape,
- T* const matrices[16], T* const output
-) {
- // Allocate memory for the intermediate matrices
- const int tile_M = iceildiv(output_shape.n_rows, 2);
- const int tile_N = iceildiv(output_shape.n_cols, 2);
- const int n_rows = output_shape.n_batches * tile_M * tile_N;
- const int n_channels = output_shape.n_channels;
- T* matrices_zf = reinterpret_cast<T*>(
- calloc(8 * n_rows * n_channels, sizeof(T))
- );
-
- // Perform the first stage transform, computing ZF.
- // Specializations should dispatch to different methods based on tail size.
- compute_zf<0>(n_rows, n_channels, matrices_zf, matrices);
-
-  // Perform the second stage transform, finishing Z^T.(F.Z) - variable dispatch
- // based on size of the output. Specialisations can also dispatch based on
- // the tail-size of the channel.
- if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
- compute_zfzT<true, true, 0>(output_shape, output, matrices_zf);
- } else if (output_shape.n_rows % 2) {
- compute_zfzT<true, false, 0>(output_shape, output, matrices_zf);
- } else if (output_shape.n_cols % 2) {
- compute_zfzT<false, true, 0>(output_shape, output, matrices_zf);
- } else {
- compute_zfzT<false, false, 0>(output_shape, output, matrices_zf);
- }
-
- free(reinterpret_cast<void*>(matrices_zf));
-}
-
-template <typename T>
-template <int channel_tail>
-void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::compute_zf(
- const int n_rows, const int n_channels,
- T* output, const T* const input[16]
-) {
- // Extract 8 output pointers
- T* outptr[8];
- for (int i = 0; i < 8; i++) {
- outptr[i] = output + i*n_rows*n_channels;
- }
-
- // Copy the 16 input pointers
- const T* inptr[16];
- for (int i = 0; i < 16; i++) {
- inptr[i] = input[i];
- }
-
- // For every row of the matrices
- for (int i = 0; i < n_rows; i++) {
- // For every channel
- for (int j = 0; j < n_channels; j++) {
- // Extract values from the input matrices
- T val[16];
- for (int n = 0; n < 16; n++) {
- val[n] = *(inptr[n]++);
- }
-
- // Compute output values
- *(outptr[0]++) = val[0] + val[1] + val[2];
- *(outptr[1]++) = val[1] - val[2] - val[3];
- *(outptr[2]++) = val[4] + val[5] + val[6];
- *(outptr[3]++) = val[5] - val[6] - val[7];
- *(outptr[4]++) = val[8] + val[9] + val[10];
- *(outptr[5]++) = val[9] - val[10] - val[11];
- *(outptr[6]++) = val[12] + val[13] + val[14];
- *(outptr[7]++) = val[13] - val[14] - val[15];
- }
- }
-}
-
-template <typename T>
-template <bool tail_M, bool tail_N, int channel_tail>
-void winograd::Winograd2x2_3x3GemmOutput_TwoStage<T>::compute_zfzT(
- const Tensor4DShape &output_shape,
- T* const output, const T* const input
-) {
- // Sizing information
- const int tile_M = output_shape.n_rows / 2;
- const int tile_N = output_shape.n_cols / 2;
-
- const int n_rows = (output_shape.n_batches *
- (tile_M + (tail_M ? 1 : 0)) *
- (tile_N + (tail_N ? 1 : 0)));
- const int n_channels = output_shape.n_channels;
-
- // Extract 8 input pointers
- const T* inptr[8];
- for (int i = 0; i < 8; i++) {
- inptr[i] = input + i*n_rows*n_channels;
- }
-
- // Extract 4 output pointers
- T* outptr00 = output;
- T* outptr01 = outptr00 + n_channels;
- T* outptr10 = outptr00 + output_shape.n_cols * n_channels;
- T* outptr11 = outptr10 + n_channels;
-
- // Progress over the output tiles, generating output values.
- for (int batch = 0; batch < output_shape.n_batches; batch++) {
- for (int tile_i = 0; tile_i < tile_M; tile_i++) {
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- T v[8];
- for (int i = 0; i < 8; i++) {
- v[i] = *(inptr[i]++);
- }
-
- // Compute the output values and progress the output pointers.
- *(outptr00++) = v[0] + v[2] + v[4];
- *(outptr01++) = v[1] + v[3] + v[5];
- *(outptr10++) = v[2] - v[4] - v[6];
- *(outptr11++) = v[3] - v[5] - v[7];
- }
-
- // Progress the output pointers to the next column
- outptr00 += n_channels;
- outptr01 += n_channels;
- outptr10 += n_channels;
- outptr11 += n_channels;
- }
-
- if (tail_N) {
- // Only evaluate the left-most columns of the output
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- T v[8];
- for (int i = 0; i < 4; i++) {
- v[i * 2] = *inptr[i * 2];
- }
- for (int i = 0; i < 8; i++) {
- inptr[i]++;
- }
-
- // Compute the output values and progress the output pointers.
- *(outptr00++) = v[0] + v[2] + v[4];
- *(outptr10++) = v[2] - v[4] - v[6];
- }
-
- // Progress the output pointers to the next column
- outptr01 += n_channels; // Account for being skipped above
- outptr11 += n_channels; // Account for being skipped above
- }
-
- // Progress the output pointers to the next row
- outptr00 += output_shape.n_cols * n_channels;
- outptr01 += output_shape.n_cols * n_channels;
- outptr10 += output_shape.n_cols * n_channels;
- outptr11 += output_shape.n_cols * n_channels;
- }
-
- if (tail_M) {
- // Only work on the upper row of the output
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- T v[8];
- for (int i = 0; i < 8; i++) {
- v[i] = *(inptr[i]++);
- }
-
- // Compute the output values and progress the output pointers.
- *(outptr00++) = v[0] + v[2] + v[4];
- *(outptr01++) = v[1] + v[3] + v[5];
- }
-
- // Progress the output pointers to the next column
- outptr00 += n_channels;
- outptr01 += n_channels;
- outptr10 += 2 * n_channels; // Account for being skipped above
- outptr11 += 2 * n_channels; // Account for being skipped above
- }
-
- if (tail_N) {
- // Only evaluate the upper-left cell of the output
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- T v[8];
- for (int i = 0; i < 3; i++) {
- v[i * 2] = *inptr[i * 2];
- }
- for (int i = 0; i < 8; i++) {
- inptr[i]++;
- }
-
- // Compute the output values and progress the output pointers.
- *(outptr00++) = v[0] + v[2] + v[4];
- }
-
- // Progress the output pointers to the next column
- outptr01 += n_channels; // Account for being skipped above
- outptr10 += n_channels; // Account for being skipped above
- outptr11 += n_channels; // Account for being skipped above
- }
- }
- }
-}
-*/
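
The output transforms declared above (and the AArch64 implementation in the file that follows) map each 4x4 Winograd-domain tile F back to a 2x2 spatial tile. A minimal sketch of the algebra, stated for orientation rather than taken from the patch:

    f = Z^{\mathsf{T}} \, F \, Z, \qquad
    Z = \begin{bmatrix} 1 & 0 \\ 1 & 1 \\ 1 & -1 \\ 0 & -1 \end{bmatrix}.

Expanding the product reproduces the element-wise sums and differences written out in the code, for example f_{00} = \sum_{i=0}^{2} \sum_{j=0}^{2} F_{ij}. The two-stage variant simply splits the product: compute_zf forms F.Z and compute_zfzT then applies Z^T to the result.
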
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
deleted file mode 100644
index bf6ba90..0000000
--- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float.hpp
+++ /dev/null
@@ -1,650 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* Float implementation for AArch64.
- */
-#ifdef __aarch64__
-namespace winograd {
-
-
-template <>
-template <>
-inline void Winograd2x2_3x3GemmOutput<float>::_execute<false, false, 0>(
- const Tensor4DShape &output_shape,
- float *output,
- const float *input,
- const int mstride,
- const int matrix_row_stride
-) {
- const int tile_M = output_shape.n_rows / 2;
- const int tile_N = output_shape.n_cols / 2;
- int batch = output_shape.n_batches;
- float *outptr = output;
-
- const float *inptr0 = input;
- const float *inptr4 = input + 4 * mstride;
- const float *inptr8 = input + 8 * mstride;
- const float *inptr12 = input + 12 * mstride;
-
- const size_t col_stride = sizeof(float) * output_shape.n_channels;
- const size_t row_stride = col_stride * tile_N * 2;
-
- asm volatile (
- // Aliases for elements of the input matrix `F`
- // V-register Q-register
- "F11 .req v0\n" "qF11 .req q0\n"
- "F12 .req v1\n" "qF12 .req q1\n"
- "F13 .req v2\n" "qF13 .req q2\n"
- "F14 .req v3\n" "qF14 .req q3\n"
- "F21 .req v4\n" "qF21 .req q4\n"
- "F22 .req v5\n" "qF22 .req q5\n"
- "F23 .req v6\n" "qF23 .req q6\n"
- "F24 .req v7\n" "qF24 .req q7\n"
- "F31 .req v8\n" "qF31 .req q8\n"
- "F32 .req v9\n" "qF32 .req q9\n"
- "F33 .req v10\n" "qF33 .req q10\n"
- "F34 .req v11\n" "qF34 .req q11\n"
- "F41 .req v12\n" "qF41 .req q12\n"
- "F42 .req v13\n" "qF42 .req q13\n"
- "F43 .req v14\n" "qF43 .req q14\n"
- "F44 .req v15\n" "qF44 .req q15\n"
-
- // Aliases for elements of the intermediate matrix `FZ`
- "FZ11 .req v16\n"
- "FZ12 .req v17\n"
- "FZ21 .req v18\n"
- "FZ22 .req v19\n"
- "FZ31 .req v20\n"
- "FZ32 .req v21\n"
- "FZ41 .req v22\n"
- "FZ42 .req v23\n"
-
- // Aliases for elements of the output matrix `f` (called `g` due to case
- // insensitivity of aliases).
- " g11 .req v24\n"
- "qg11 .req q24\n"
- " g12 .req v25\n"
- "qg12 .req q25\n"
- " g21 .req v26\n"
- "qg21 .req q26\n"
- " g22 .req v27\n"
- "qg22 .req q27\n"
-
- // Prepare the various strides
- "col_stride .req %x[col_stride]\n"
- "row_stride .req %x[row_stride]\n"
- "row_plus_col_stride .req %x[row_plus_col_stride]\n"
-
- "mstride1 .req %x[mstride1]\n"
- "mstride2 .req %x[mstride2]\n"
- "mstride3 .req %x[mstride3]\n"
-
- "tile_i .req x19\n" // Tile row counter
- "tile_j .req x20\n" // Tile column counter
- "channel .req x21\n" // Channel counter
-
- "1:" // Loop over batches
- "mov tile_i, %x[tile_M]\n" // Reset tile row counter
-
- "2:" // Loop over rows of tiles
- "mov tile_j, %x[tile_N]\n" // Reset tile column counter
-
- "3:" // Loop over columns of tiles
- // Perform initial loads of the matrix `F`
- "ldr qF11, [%x[inptr0]]\n"
- "ldr qF12, [%x[inptr0], mstride1]\n"
- "ldr qF13, [%x[inptr0], mstride2]\n"
- "ldr qF14, [%x[inptr0], mstride3]\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
- "ldr qF21, [%x[inptr4]]\n"
- "ldr qF22, [%x[inptr4], mstride1]\n"
- "subs channel, %x[n_channels], #4\n" // Reset channel counter
-
- "ldr qF23, [%x[inptr4], mstride2]\n"
- "ldr qF24, [%x[inptr4], mstride3]\n"
- "add %x[inptr4], %x[inptr4], #0x10\n"
- "beq 5f\n" // Jump straight to tail if necessary
-
- "4:" // Loop over channels
- "ldr qF31, [%x[inptr8]]\n"
- "fadd FZ11.4s, F11.4s, F12.4s\n"
-
- "ldr qF32, [%x[inptr8], mstride1]\n"
- "fsub FZ12.4s, F12.4s, F13.4s\n"
-
- "ldr qF33, [%x[inptr8], mstride2]\n"
- "fadd FZ11.4s, FZ11.4s, F13.4s\n"
-
- "ldr qF34, [%x[inptr8], mstride3]\n"
- "fsub FZ12.4s, FZ12.4s, F14.4s\n"
-
- "ldr qF41, [%x[inptr12]]\n"
- "fadd FZ21.4s, F21.4s, F22.4s\n"
-
- "ldr qF42, [%x[inptr12], mstride1]\n"
- "fsub FZ22.4s, F22.4s, F23.4s\n"
-
- "ldr qF43, [%x[inptr12], mstride2]\n"
- "fadd FZ21.4s, FZ21.4s, F23.4s\n"
-
- "ldr qF44, [%x[inptr12], mstride3]\n"
- "fsub FZ22.4s, FZ22.4s, F24.4s\n"
-
- "fadd FZ31.4s, F31.4s, F32.4s\n"
- "add %x[inptr8], %x[inptr8], #0x10\n"
-
- "fsub FZ32.4s, F32.4s, F33.4s\n"
- "add %x[inptr12], %x[inptr12], #0x10\n"
-
- "fadd FZ31.4s, FZ31.4s, F33.4s\n"
-
- "fsub FZ32.4s, FZ32.4s, F34.4s\n"
-
- "fadd g11.4s, FZ11.4s, FZ21.4s\n"
-
- "fadd g12.4s, FZ12.4s, FZ22.4s\n"
-
- "fadd g11.4s, g11.4s, FZ31.4s\n"
-
- "fadd g12.4s, g12.4s, FZ32.4s\n"
-
- "ldr qF11, [%x[inptr0]]\n"
- "fadd FZ41.4s, F41.4s, F42.4s\n"
-
- "ldr qF12, [%x[inptr0], mstride1]\n"
- "fsub g21.4s, FZ21.4s, FZ31.4s\n"
-
- "ldr qF13, [%x[inptr0], mstride2]\n"
- "fsub FZ42.4s, F42.4s, F43.4s\n"
-
- "ldr qF14, [%x[inptr0], mstride3]\n"
- "str qg11, [%x[outptr]]\n"
-
- "ldr qF21, [%x[inptr4]]\n"
- "fadd FZ41.4s, FZ41.4s, F43.4s\n"
-
- "ldr qF22, [%x[inptr4], mstride1]\n"
- "str qg12, [%x[outptr], col_stride]\n"
-
- "ldr qF23, [%x[inptr4], mstride2]\n"
- "fsub FZ42.4s, FZ42.4s, F44.4s\n"
-
- "ldr qF24, [%x[inptr4], mstride3]\n"
- "fsub g22.4s, FZ22.4s, FZ32.4s\n"
-
- "fsub g21.4s, g21.4s, FZ41.4s\n"
- "add %x[inptr0], %x[inptr0], #0x10\n"
-
- "fsub g22.4s, g22.4s, FZ42.4s\n"
- "add %x[inptr4], %x[inptr4], #0x10\n"
-
- "subs channel, channel, #4\n"
-
- "str qg21, [%x[outptr], row_stride]\n"
-
- "str qg22, [%x[outptr], row_plus_col_stride]\n"
-
- "add %x[outptr], %x[outptr], #0x10\n"
-
- "bne 4b\n"
-
- "5:" // Channel tail
- "ldr qF31, [%x[inptr8]]\n"
- "fadd FZ11.4s, F11.4s, F12.4s\n"
-
- "ldr qF32, [%x[inptr8], mstride1]\n"
- "fsub FZ12.4s, F12.4s, F13.4s\n"
-
- "ldr qF33, [%x[inptr8], mstride2]\n"
- "fadd FZ11.4s, FZ11.4s, F13.4s\n"
-
- "ldr qF34, [%x[inptr8], mstride3]\n"
- "fsub FZ12.4s, FZ12.4s, F14.4s\n"
-
- "ldr qF41, [%x[inptr12]]\n"
- "fadd FZ21.4s, F21.4s, F22.4s\n"
-
- "ldr qF42, [%x[inptr12], mstride1]\n"
- "fsub FZ22.4s, F22.4s, F23.4s\n"
-
- "ldr qF43, [%x[inptr12], mstride2]\n"
- "fadd FZ21.4s, FZ21.4s, F23.4s\n"
-
- "ldr qF44, [%x[inptr12], mstride3]\n"
- "fsub FZ22.4s, FZ22.4s, F24.4s\n"
-
- "fadd FZ31.4s, F31.4s, F32.4s\n"
- "add %x[inptr8], %x[inptr8], #0x10\n"
-
- "fsub FZ32.4s, F32.4s, F33.4s\n"
- "add %x[inptr12], %x[inptr12], #0x10\n"
-
- "fadd FZ31.4s, FZ31.4s, F33.4s\n"
-
- "fsub FZ32.4s, FZ32.4s, F34.4s\n"
-
- "fadd g11.4s, FZ11.4s, FZ21.4s\n"
-
- "fadd g12.4s, FZ12.4s, FZ22.4s\n"
-
- "fadd g11.4s, g11.4s, FZ31.4s\n"
-
- "fadd g12.4s, g12.4s, FZ32.4s\n"
-
- "fadd FZ41.4s, F41.4s, F42.4s\n"
-
- "fsub g21.4s, FZ21.4s, FZ31.4s\n"
-
- "fsub FZ42.4s, F42.4s, F43.4s\n"
-
- "str qg11, [%x[outptr]]\n"
-
- "fadd FZ41.4s, FZ41.4s, F43.4s\n"
-
- "str qg12, [%x[outptr], col_stride]\n"
-
- "fsub FZ42.4s, FZ42.4s, F44.4s\n"
-
- "fsub g22.4s, FZ22.4s, FZ32.4s\n"
-
- "fsub g21.4s, g21.4s, FZ41.4s\n"
-
- "fsub g22.4s, g22.4s, FZ42.4s\n"
-
- "subs channel, channel, #4\n"
-
- "str qg21, [%x[outptr], row_stride]\n"
-
- // Progress input pointers to the next row of the matrix
- "add %x[inptr0], %x[inptr0], %x[mrowpad]\n"
- "add %x[inptr4], %x[inptr4], %x[mrowpad]\n"
- "add %x[inptr8], %x[inptr8], %x[mrowpad]\n"
- "add %x[inptr12], %x[inptr12], %x[mrowpad]\n"
-
- "str qg22, [%x[outptr], row_plus_col_stride]\n"
-
- "add %x[outptr], %x[outptr], #0x10\n"
-
-
- "add %x[outptr], %x[outptr], col_stride\n"
- "subs tile_j, tile_j, #1\n"
- "bne 3b\n"
-
- "add %x[outptr], %x[outptr], row_stride\n"
- "subs tile_i, tile_i, #1\n"
- "bne 2b\n"
-
- "subs %w[batch], %w[batch], #1\n"
- "bne 1b\n"
-
- ".unreq F11\n" ".unreq qF11\n"
- ".unreq F12\n" ".unreq qF12\n"
- ".unreq F13\n" ".unreq qF13\n"
- ".unreq F14\n" ".unreq qF14\n"
- ".unreq F21\n" ".unreq qF21\n"
- ".unreq F22\n" ".unreq qF22\n"
- ".unreq F23\n" ".unreq qF23\n"
- ".unreq F24\n" ".unreq qF24\n"
- ".unreq F31\n" ".unreq qF31\n"
- ".unreq F32\n" ".unreq qF32\n"
- ".unreq F33\n" ".unreq qF33\n"
- ".unreq F34\n" ".unreq qF34\n"
- ".unreq F41\n" ".unreq qF41\n"
- ".unreq F42\n" ".unreq qF42\n"
- ".unreq F43\n" ".unreq qF43\n"
- ".unreq F44\n" ".unreq qF44\n"
-
- ".unreq FZ11\n" ".unreq FZ12\n"
- ".unreq FZ21\n" ".unreq FZ22\n"
- ".unreq FZ31\n" ".unreq FZ32\n"
- ".unreq FZ41\n" ".unreq FZ42\n"
-
- ".unreq g11\n" ".unreq qg11\n"
- ".unreq g12\n" ".unreq qg12\n"
- ".unreq g21\n" ".unreq qg21\n"
- ".unreq g22\n" ".unreq qg22\n"
-
- ".unreq col_stride\n"
- ".unreq row_stride\n"
- ".unreq row_plus_col_stride\n"
-
- ".unreq mstride1\n"
- ".unreq mstride2\n"
- ".unreq mstride3\n"
-
- ".unreq tile_i \n"
- ".unreq tile_j \n"
- ".unreq channel\n"
-
- : [batch] "+r" (batch),
- [outptr] "+r" (outptr),
- [inptr0] "+r" (inptr0),
- [inptr4] "+r" (inptr4),
- [inptr8] "+r" (inptr8),
- [inptr12] "+r" (inptr12)
- : [tile_M] "r" (tile_M),
- [tile_N] "r" (tile_N),
- [n_channels] "r" (output_shape.n_channels),
- [col_stride] "r" (col_stride),
- [row_stride] "r" (row_stride),
- [row_plus_col_stride] "r" (row_stride + col_stride),
- [mstride1] "r" (mstride * sizeof(float)),
- [mstride2] "r" (2 * mstride * sizeof(float)),
- [mstride3] "r" (3 * mstride * sizeof(float)),
- [mrowpad] "r" ((matrix_row_stride - output_shape.n_channels) * sizeof(float))
- : "x19", "x20", "x21",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
- "v22", "v23", "v24", "v25", "v26", "v27",
- "cc", "memory"
- );
-}
-
-template <>
-template <bool tail_M, bool tail_N, const int channel_tail>
-inline void Winograd2x2_3x3GemmOutput<float>::_execute(
- const Tensor4DShape &output_shape,
- float *output,
- const float *input,
- const int mstride,
- const int matrix_row_stride
-) {
- // Compute basic information about the shape of the matrices
- const int tile_M = output_shape.n_rows / 2;
- const int tile_N = output_shape.n_cols / 2;
- const int n_channels = output_shape.n_channels;
-
- // Extract 16 input pointers
- const float* inptr[16];
- for (int i = 0; i < 16; i++) {
- inptr[i] = input + i*mstride;
- }
-
- // Extract 4 output pointers
- float *outptr00 = output;
- float *outptr01 = outptr00 + n_channels;
- float *outptr10 = outptr00 + output_shape.n_cols * n_channels;
- float *outptr11 = outptr10 + n_channels;
-
- // Progress over the output tiles, generating output values.
- for (int batch = 0; batch < output_shape.n_batches; batch++) {
- for (int tile_i = 0; tile_i < tile_M; tile_i++) {
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- float F[4][4];
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 4; j++) {
- F[i][j] = *(inptr[i*4 + j]++);
- }
- }
-
- // Compute the matrix F.Z
- float ZF[4][2];
- ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
- ZF[0][1] = F[0][1] - F[0][2] - F[0][3];
- ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
- ZF[1][1] = F[1][1] - F[1][2] - F[1][3];
- ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
- ZF[2][1] = F[2][1] - F[2][2] - F[2][3];
- ZF[3][0] = F[3][0] + F[3][1] + F[3][2];
- ZF[3][1] = F[3][1] - F[3][2] - F[3][3];
-
- // Hence compute the output matrix Z^T . (F.Z)
- *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
- *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1];
- *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0];
- *(outptr11++) = ZF[1][1] - ZF[2][1] - ZF[3][1];
- }
-
- // Progress the input pointers to the next row
- for (int i = 0; i < 16; i++) {
- inptr[i] += matrix_row_stride - n_channels;
- }
-
- // Progress the output pointers to the next column
- outptr00 += n_channels;
- outptr01 += n_channels;
- outptr10 += n_channels;
- outptr11 += n_channels;
- }
-
- if (tail_N) {
- // Only evaluate the left-most columns of the output
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- float F[4][3];
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 3; j++) {
- F[i][j] = *(inptr[i*4 + j]++);
- }
- }
- for (int i = 0; i < 4; i++) {
- inptr[i*4 + 3]++;
- }
-
- // Compute the matrix F.Z
- float ZF[4][1];
- ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
- ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
- ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
- ZF[3][0] = F[3][0] + F[3][1] + F[3][2];
-
- // Hence compute the output matrix Z^T . (F.Z)
- *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
- *(outptr10++) = ZF[1][0] - ZF[2][0] - ZF[3][0];
- }
-
- // Progress the input pointers to the next row
- for (int i = 0; i < 16; i++) {
- inptr[i] += matrix_row_stride - n_channels;
- }
-
- // Progress the output pointers to the next column
- outptr01 += n_channels; // Account for being skipped above
- outptr11 += n_channels; // Account for being skipped above
- }
-
- // Progress the output pointers to the next row
- outptr00 += output_shape.n_cols * n_channels;
- outptr01 += output_shape.n_cols * n_channels;
- outptr10 += output_shape.n_cols * n_channels;
- outptr11 += output_shape.n_cols * n_channels;
- }
-
- if (tail_M) {
- // Only work on the upper row of the output
- for (int tile_j = 0; tile_j < tile_N; tile_j++) {
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- float F[3][4];
- for (int i = 0; i < 3; i++) {
- for (int j = 0; j < 4; j++) {
- F[i][j] = *(inptr[i*4 + j]++);
- }
- }
- for (int j = 0; j < 4; j++) {
- inptr[12 + j]++;
- }
-
- // Compute the matrix F.Z
- float ZF[3][2];
- ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
- ZF[0][1] = F[0][1] - F[0][2] - F[0][3];
- ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
- ZF[1][1] = F[1][1] - F[1][2] - F[1][3];
- ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
- ZF[2][1] = F[2][1] - F[2][2] - F[2][3];
-
- // Hence compute the output matrix Z^T . (F.Z)
- *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
- *(outptr01++) = ZF[0][1] + ZF[1][1] + ZF[2][1];
- }
-
- // Progress the input pointers to the next row
- for (int i = 0; i < 16; i++) {
- inptr[i] += matrix_row_stride - n_channels;
- }
-
- // Progress the output pointers to the next column
- outptr00 += n_channels;
- outptr01 += n_channels;
- outptr10 += 2 * n_channels; // Account for being skipped above
- outptr11 += 2 * n_channels; // Account for being skipped above
- }
-
- if (tail_N) {
- // Only evaluate the upper-left cell of the output
- for (int channel = 0; channel < n_channels; channel++) {
- // Read values from the input pointers
- float F[3][3];
- for (int i = 0; i < 3; i++) {
- for (int j = 0; j < 3; j++) {
- F[i][j] = *(inptr[i*4 + j]);
- }
- }
- for (int i = 0; i < 16; i++) {
- inptr[i]++;
- }
-
- // Compute the matrix F.Z
- float ZF[3][1];
- ZF[0][0] = F[0][0] + F[0][1] + F[0][2];
- ZF[1][0] = F[1][0] + F[1][1] + F[1][2];
- ZF[2][0] = F[2][0] + F[2][1] + F[2][2];
-
- // Hence compute the output matrix Z^T . (F.Z)
- *(outptr00++) = ZF[0][0] + ZF[1][0] + ZF[2][0];
- }
-
- // Progress the input pointers to the next row
- for (int i = 0; i < 16; i++) {
- inptr[i] += matrix_row_stride - n_channels;
- }
-
- // Progress the output pointers to the next column
- outptr01 += n_channels; // Account for being skipped above
- outptr10 += n_channels; // Account for being skipped above
- outptr11 += n_channels; // Account for being skipped above
- }
- }
- }
-}
-
-/*****************************************************************************/
-template <>
-inline void Winograd2x2_3x3GemmOutput<float>::execute(
- const Tensor4DShape &output_shape,
- float* const matrix_base,
- const int matrix_stride,
- const int matrix_row_stride,
- float* const output
-) {
- // Dispatch to an appropriate implementation based on the shape of the output
- // tensor.
- if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
- constexpr bool tail_M = true, tail_N = true;
- switch (output_shape.n_channels % 4) {
- case 0:
- _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 1:
- _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 2:
- _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 3:
- _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- default:
- assert(0);
- break;
- }
- } else if (output_shape.n_rows % 2) {
- constexpr bool tail_M = true, tail_N = false;
- switch (output_shape.n_channels % 4) {
- case 0:
- _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 1:
- _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 2:
- _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 3:
- _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- default:
- assert(0);
- break;
- }
- } else if (output_shape.n_cols % 2) {
- constexpr bool tail_M = false, tail_N = true;
- switch (output_shape.n_channels % 4) {
- case 0:
- _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 1:
- _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 2:
- _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 3:
- _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- default:
- assert(0);
- break;
-
- }
- } else {
- constexpr bool tail_M = false, tail_N = false;
- switch (output_shape.n_channels % 4) {
- case 0:
- _execute<tail_M, tail_N, 0>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 1:
- _execute<tail_M, tail_N, 1>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 2:
- _execute<tail_M, tail_N, 2>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- case 3:
- _execute<tail_M, tail_N, 3>(output_shape, output, matrix_base, matrix_stride, matrix_row_stride);
- break;
- default:
- assert(0);
- break;
-
- }
- }
-}
-/*****************************************************************************/
-
-} // namespace winograd
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
deleted file mode 100644
index f551b12..0000000
--- a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3/a64_float_two_stage.hpp
+++ /dev/null
@@ -1,655 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-/*****************************************************************************/
-// Compute ZF specializations
-
-template <>
-template <>
-inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::compute_zf<0>(
- const int n_rows, const int n_channels,
- float* output, const float* const input[16]
-) {
- // Make copies of some variables
- int row = n_rows;
- float* outptr = output;
- const float* inptr = input[0];
-
- // Perform the transformation
- asm volatile (
- // "inptr0 .req %x[inptr]\n"
- "inptr1 .req x0\n"
- "inptr2 .req x1\n"
- "inptr3 .req x2\n"
- "inptr4 .req x3\n"
- "inptr5 .req x4\n"
- "inptr6 .req x5\n"
- "inptr7 .req x6\n"
- "inptr8 .req x7\n"
- "inptr9 .req x8\n"
- "inptr10 .req x9\n"
- "inptr11 .req x10\n"
- "inptr12 .req x11\n"
- "inptr13 .req x12\n"
- "inptr14 .req x13\n"
- "inptr15 .req x14\n"
-
- // "outptr0 .req %x[outptr]\n"
- "outptr1 .req x15\n"
- "outptr2 .req x16\n"
- "outptr3 .req x17\n"
- "outptr4 .req x18\n"
- "outptr5 .req x19\n"
- "outptr6 .req x20\n"
- "outptr7 .req x21\n"
-
- // Compute additional pointers into the input and output matrices.
- "mstride .req x22\n" // Matrix stride
- "mul mstride, %x[row], %x[n_channels]\n"
- "lsl mstride, mstride, #2\n" // * sizeof(float)
-
- "add inptr1, %x[inptr], mstride\n"
- "add inptr2, %x[inptr], mstride, LSL #1\n"
- "add inptr3, inptr2, mstride\n"
- "add inptr4, inptr3, mstride\n"
- "add inptr5, inptr4, mstride\n"
- "add inptr6, inptr5, mstride\n"
- "add inptr7, inptr6, mstride\n"
- "add inptr8, inptr7, mstride\n"
- "add inptr9, inptr8, mstride\n"
- "add inptr10, inptr9, mstride\n"
- "add inptr11, inptr10, mstride\n"
- "add inptr12, inptr11, mstride\n"
- "add inptr13, inptr12, mstride\n"
- "add inptr14, inptr13, mstride\n"
- "add inptr15, inptr14, mstride\n"
-
- "add outptr1, %[outptr], mstride\n"
- "add outptr2, outptr1, mstride\n"
- "add outptr3, outptr2, mstride\n"
- "add outptr4, outptr3, mstride\n"
- "add outptr5, outptr4, mstride\n"
- "add outptr6, outptr5, mstride\n"
- "add outptr7, outptr6, mstride\n"
-
- ".unreq mstride\n"
-
- "column .req x22\n" // Column loop counter
-
- "1:" // Loop over rows
- "ldr q0, [%x[inptr]], #0x10\n"
- "ldr q1, [inptr1], #0x10\n"
- "ldr q2, [inptr2], #0x10\n"
- "ldr q3, [inptr3], #0x10\n"
- "ldr q4, [inptr4], #0x10\n"
- "ldr q5, [inptr5], #0x10\n"
- "ldr q6, [inptr6], #0x10\n"
- "ldr q7, [inptr7], #0x10\n"
- "subs column, %x[n_channels], #0x4\n"
- "beq 3f\n"
-
- "2:" // Loop over columns
- "ldr q8, [inptr8], #0x10\n"
- "prfm pldl1keep, [%x[inptr], #196]\n"
- "fadd v16.4s, v0.4s, v1.4s\n"
-
- "ldr q9, [inptr9], #0x10\n"
- "prfm pldl1keep, [inptr1, #196]\n"
- "fsub v17.4s, v1.4s, v2.4s\n"
-
- "ldr q10, [inptr10], #0x10\n"
- "prfm pldl1keep, [inptr2, #196]\n"
- "fadd v16.4s, v16.4s, v2.4s\n"
-
- "ldr q11, [inptr11], #0x10\n"
- "prfm pldl1keep, [inptr3, #196]\n"
- "fsub v17.4s, v17.4s, v3.4s\n"
-
- "ldr q12, [inptr12], #0x10\n"
- "prfm pldl1keep, [inptr4, #196]\n"
- "str q16, [%x[outptr]], #0x10\n"
-
- "ldr q13, [inptr13], #0x10\n"
- "prfm pldl1keep, [inptr5, #196]\n"
- "str q17, [outptr1], #0x10\n"
-
- "ldr q14, [inptr14], #0x10\n"
- "prfm pldl1keep, [inptr6, #196]\n"
- "fadd v16.4s, v4.4s, v5.4s\n"
-
- "ldr q15, [inptr15], #0x10\n"
- "prfm pldl1keep, [inptr7, #196]\n"
- "fsub v17.4s, v5.4s, v6.4s\n"
-
- "ldr q0, [%x[inptr]], #0x10\n"
- "prfm pldl1keep, [inptr8, #196]\n"
- "fadd v16.4s, v16.4s, v6.4s\n"
-
- "ldr q1, [inptr1], #0x10\n"
- "prfm pldl1keep, [inptr9, #196]\n"
- "fsub v17.4s, v17.4s, v7.4s\n"
-
- "ldr q2, [inptr2], #0x10\n"
- "prfm pldl1keep, [inptr10, #196]\n"
- "str q16, [outptr2], #0x10\n"
-
- "ldr q3, [inptr3], #0x10\n"
- "prfm pldl1keep, [inptr11, #196]\n"
- "str q17, [outptr3], #0x10\n"
-
- "ldr q4, [inptr4], #0x10\n"
- "prfm pldl1keep, [inptr12, #196]\n"
- "fadd v16.4s, v8.4s, v9.4s\n"
-
- "ldr q5, [inptr5], #0x10\n"
- "prfm pldl1keep, [inptr13, #196]\n"
- "fsub v17.4s, v9.4s, v10.4s\n"
-
- "ldr q6, [inptr6], #0x10\n"
- "prfm pldl1keep, [inptr14, #196]\n"
- "fadd v16.4s, v16.4s, v10.4s\n"
-
- "ldr q7, [inptr7], #0x10\n"
- "prfm pldl1keep, [inptr15, #196]\n"
- "fsub v17.4s, v17.4s, v11.4s\n"
-
- "str q16, [outptr4], #0x10\n"
- "fadd v16.4s, v12.4s, v13.4s\n"
- "fsub v18.4s, v13.4s, v14.4s\n"
-
- "str q17, [outptr5], #0x10\n"
- "fadd v16.4s, v16.4s, v14.4s\n"
- "fsub v18.4s, v18.4s, v15.4s\n"
-
- "str q16, [outptr6], #0x10\n"
- "subs column, column, #0x4\n"
-
- "str q18, [outptr7], #0x10\n"
- "bne 2b\n"
-
- "3:" // Tail
- "ldr q8, [inptr8], #0x10\n"
- "prfm pldl1keep, [%x[inptr], #196]\n"
- "fadd v16.4s, v0.4s, v1.4s\n"
-
- "ldr q9, [inptr9], #0x10\n"
- "prfm pldl1keep, [inptr1, #196]\n"
- "fsub v17.4s, v1.4s, v2.4s\n"
-
- "ldr q10, [inptr10], #0x10\n"
- "prfm pldl1keep, [inptr2, #196]\n"
- "fadd v16.4s, v16.4s, v2.4s\n"
-
- "ldr q11, [inptr11], #0x10\n"
- "prfm pldl1keep, [inptr3, #196]\n"
- "fsub v17.4s, v17.4s, v3.4s\n"
-
- "ldr q12, [inptr12], #0x10\n"
- "prfm pldl1keep, [inptr4, #196]\n"
- "str q16, [%x[outptr]], #0x10\n"
-
- "ldr q13, [inptr13], #0x10\n"
- "prfm pldl1keep, [inptr5, #196]\n"
- "str q17, [outptr1], #0x10\n"
-
- "ldr q14, [inptr14], #0x10\n"
- "prfm pldl1keep, [inptr6, #196]\n"
- "fadd v16.4s, v4.4s, v5.4s\n"
-
- "ldr q15, [inptr15], #0x10\n"
- "prfm pldl1keep, [inptr7, #196]\n"
- "fsub v17.4s, v5.4s, v6.4s\n"
-
- "prfm pldl1keep, [inptr8, #196]\n"
- "prfm pldl1keep, [inptr9, #196]\n"
- "fadd v16.4s, v16.4s, v6.4s\n"
-
- "prfm pldl1keep, [inptr10, #196]\n"
- "prfm pldl1keep, [inptr11, #196]\n"
- "fsub v17.4s, v17.4s, v7.4s\n"
-
- "prfm pldl1keep, [inptr12, #196]\n"
- "prfm pldl1keep, [inptr13, #196]\n"
- "str q16, [outptr2], #0x10\n"
-
- "prfm pldl1keep, [inptr14, #196]\n"
- "prfm pldl1keep, [inptr15, #196]\n"
- "str q17, [outptr3], #0x10\n"
-
- "fadd v16.4s, v8.4s, v9.4s\n"
- "fsub v17.4s, v9.4s, v10.4s\n"
-
- "fadd v16.4s, v16.4s, v10.4s\n"
- "fsub v17.4s, v17.4s, v11.4s\n"
-
- "str q16, [outptr4], #0x10\n"
- "fadd v16.4s, v12.4s, v13.4s\n"
- "fsub v18.4s, v13.4s, v14.4s\n"
-
- "str q17, [outptr5], #0x10\n"
- "fadd v16.4s, v16.4s, v14.4s\n"
- "fsub v18.4s, v18.4s, v15.4s\n"
-
- "str q16, [outptr6], #0x10\n"
- "str q18, [outptr7], #0x10\n"
-
- "subs %x[row], %x[row], #0x1\n"
- "bne 1b\n"
-
- ".unreq inptr1\n"
- ".unreq inptr2\n"
- ".unreq inptr3\n"
- ".unreq inptr4\n"
- ".unreq inptr5\n"
- ".unreq inptr6\n"
- ".unreq inptr7\n"
- ".unreq inptr8\n"
- ".unreq inptr9\n"
- ".unreq inptr10\n"
- ".unreq inptr11\n"
- ".unreq inptr12\n"
- ".unreq inptr13\n"
- ".unreq inptr14\n"
- ".unreq inptr15\n"
- ".unreq outptr1\n"
- ".unreq outptr2\n"
- ".unreq outptr3\n"
- ".unreq outptr4\n"
- ".unreq outptr5\n"
- ".unreq outptr6\n"
- ".unreq outptr7\n"
-
- : [row] "+r" (row),
- [inptr] "+r" (inptr),
- [outptr] "+r" (outptr)
- : [n_channels] "r" (n_channels),
- [sizeof_float] "i" (sizeof(float))
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
- "q12", "q13", "q14", "q15", "q16", "q17", "x0", "x1", "x2", "x3", "x4",
- "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
- "x16", "x17", "x18", "x19", "x20", "x21", "x22", "cc", "memory"
- );
-}
-
-/*****************************************************************************/
-// Compute ZFZ^T specializations
-
-template <>
-template <>
-inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::compute_zfzT<false, false, 0>(
- const Tensor4DShape &output_shape,
- float* const output, const float* const input
-) {
- const int tile_M = output_shape.n_rows / 2;
- const int tile_N = output_shape.n_cols / 2;
- int batch = output_shape.n_batches;
- float *outptr = output;
- const float *inptr = input;
-
- asm volatile (
- // Compute input pointers
- "inptr1 .req x0\n"
- "inptr2 .req x1\n"
- "inptr3 .req x2\n"
- "inptr4 .req x3\n"
- "inptr5 .req x4\n"
- "inptr6 .req x5\n"
- "inptr7 .req x6\n"
- "inptr8 .req x7\n"
-
- "mstride .req x8\n"
- "mul mstride, %x[tile_M], %x[tile_N]\n"
- "mul mstride, mstride, %x[n_channels]\n"
- "lsl mstride, mstride, #2\n" // * sizeof(float)
-
- "add inptr1, %[inptr], mstride\n"
- "add inptr2, inptr1, mstride\n"
- "add inptr3, inptr2, mstride\n"
- "add inptr4, inptr3, mstride\n"
- "add inptr5, inptr4, mstride\n"
- "add inptr6, inptr5, mstride\n"
- "add inptr7, inptr6, mstride\n"
- "add inptr8, inptr7, mstride\n"
-
- ".unreq mstride\n"
-
- // Compute initial output pointers
- "outptr01 .req x8\n"
- "outptr10 .req x9\n"
- "outptr11 .req x10\n"
-
- "add outptr01, %x[outptr], %x[n_channels], LSL #2\n"
- "add outptr10, %x[outptr], %x[row_stride], LSL #2\n"
- "add outptr11, outptr10, %x[n_channels], LSL #2\n"
-
- "tile_i .req x11\n"
- "tile_j .req x12\n"
- "channel .req x13\n"
-
- "1:" // Loop over batches
- "mov tile_i, %x[tile_M]\n"
-
- "2:" // Loop over rows of output tiles
- "mov tile_j, %x[tile_N]\n"
-
- "3:" // Loop over columns of output tiles
- "ldr q0, [%x[inptr]], #0x10\n"
- "ldr q2, [inptr2], #0x10\n"
- "subs channel, %x[n_channels], #0x4\n"
-
- "ldr q1, [inptr1], #0x10\n"
- "ldr q3, [inptr3], #0x10\n"
- "beq 6f\n"
-
- "4:"
- "ldr q4, [inptr4], #0x10\n"
- "ldr q5, [inptr5], #0x10\n"
- "fadd v16.4s, v0.4s, v2.4s\n"
-
- "ldr q6, [inptr6], #0x10\n"
- "ldr q7, [inptr7], #0x10\n"
- "fadd v17.4s, v1.4s, v3.4s\n"
-
- "ldr q8, [%x[inptr]], #0x10\n"
- "ldr q10, [inptr2], #0x10\n"
- "fadd v16.4s, v16.4s, v4.4s\n"
-
- "ldr q9, [inptr1], #0x10\n"
- "ldr q11, [inptr3], #0x10\n"
- "fadd v17.4s, v17.4s, v5.4s\n"
-
- "str q16, [%x[outptr]], #0x10\n"
- "prfm pldl1strm, [%x[inptr], #196]\n"
- "fsub v18.4s, v2.4s, v4.4s\n"
-
- "str q17, [outptr01], #0x10\n"
- "prfm pldl1strm, [inptr2, #196]\n"
- "fsub v19.4s, v3.4s, v5.4s\n"
-
- "prfm pldl1strm, [inptr1, #196]\n"
- "prfm pldl1strm, [inptr3, #196]\n"
- "fsub v18.4s, v18.4s, v6.4s\n"
-
- "prfm pldl1strm, [inptr4, #196]\n"
- "prfm pldl1strm, [inptr5, #196]\n"
- "fsub v19.4s, v19.4s, v7.4s\n"
-
- "str q18, [outptr10], #0x10\n"
- "prfm pldl1strm, [inptr6, #196]\n"
- "prfm pldl1strm, [inptr7, #196]\n"
-
- "subs channel, channel, #0x4\n"
-
- "str q19, [outptr11], #0x10\n"
- "beq 6f\n" // Branch to tail
-
- "ldr q12, [inptr4], #0x10\n"
- "ldr q13, [inptr5], #0x10\n"
- "fadd v16.4s, v8.4s, v10.4s\n"
-
- "ldr q14, [inptr6], #0x10\n"
- "ldr q15, [inptr7], #0x10\n"
- "fadd v17.4s, v9.4s, v11.4s\n"
-
- "ldr q0, [%x[inptr]], #0x10\n"
- "ldr q2, [inptr2], #0x10\n"
- "fadd v16.4s, v16.4s, v12.4s\n"
-
- "ldr q1, [inptr1], #0x10\n"
- "ldr q3, [inptr3], #0x10\n"
- "fadd v17.4s, v17.4s, v13.4s\n"
-
- "str q16, [%x[outptr]], #0x10\n"
- "prfm pldl1strm, [%x[inptr], #196]\n"
- "fsub v18.4s, v10.4s, v12.4s\n"
-
- "str q17, [outptr01], #0x10\n"
- "prfm pldl1strm, [inptr2, #196]\n"
- "fsub v19.4s, v11.4s, v13.4s\n"
-
- "prfm pldl1strm, [inptr1, #196]\n"
- "prfm pldl1strm, [inptr3, #196]\n"
- "fsub v18.4s, v18.4s, v14.4s\n"
-
- "prfm pldl1strm, [inptr4, #196]\n"
- "prfm pldl1strm, [inptr5, #196]\n"
- "fsub v19.4s, v19.4s, v15.4s\n"
-
- "str q18, [outptr10], #0x10\n"
- "prfm pldl1strm, [inptr6, #196]\n"
- "prfm pldl1strm, [inptr7, #196]\n"
-
- "subs channel, channel, #0x4\n"
-
- "str q19, [outptr11], #0x10\n"
- "bne 4b\n" // Continue loop
-
- "5:" // Tail
- "ldr q12, [inptr4], #0x10\n"
- "ldr q13, [inptr5], #0x10\n"
- "fadd v16.4s, v8.4s, v10.4s\n"
-
- "ldr q14, [inptr6], #0x10\n"
- "ldr q15, [inptr7], #0x10\n"
- "fadd v17.4s, v9.4s, v11.4s\n"
-
- "fadd v16.4s, v16.4s, v12.4s\n"
-
- "fadd v17.4s, v17.4s, v13.4s\n"
-
- "str q16, [%x[outptr]], #0x10\n"
- "fsub v18.4s, v10.4s, v12.4s\n"
- "fsub v19.4s, v11.4s, v13.4s\n"
-
- "str q17, [outptr01], #0x10\n"
- "fsub v18.4s, v18.4s, v14.4s\n"
- "fsub v19.4s, v19.4s, v15.4s\n"
-
- "str q18, [outptr10], #0x10\n"
- "str q19, [outptr11], #0x10\n"
- "b 7f\n"
-
- "6:" // Tail
- "ldr q4, [inptr4], #0x10\n"
- "ldr q5, [inptr5], #0x10\n"
- "fadd v16.4s, v0.4s, v2.4s\n"
-
- "ldr q6, [inptr6], #0x10\n"
- "ldr q7, [inptr7], #0x10\n"
- "fadd v17.4s, v1.4s, v3.4s\n"
-
- "fadd v16.4s, v16.4s, v4.4s\n"
-
- "fadd v17.4s, v17.4s, v5.4s\n"
-
- "str q16, [%x[outptr]], #0x10\n"
- "fsub v18.4s, v2.4s, v4.4s\n"
- "fsub v19.4s, v3.4s, v5.4s\n"
-
- "str q17, [outptr01], #0x10\n"
- "fsub v18.4s, v18.4s, v6.4s\n"
- "fsub v19.4s, v19.4s, v7.4s\n"
-
- "str q18, [outptr10], #0x10\n"
- "str q19, [outptr11], #0x10\n"
-
- "7:"
- "add %x[outptr], %x[outptr], %x[n_channels], LSL #2\n"
- "add outptr01, outptr01, %x[n_channels], LSL #2\n"
- "add outptr10, outptr10, %x[n_channels], LSL #2\n"
- "add outptr11, outptr11, %x[n_channels], LSL #2\n"
-
- "subs tile_j, tile_j, #1\n"
- "bne 3b\n"
-
- // Progress the output pointers to the new row
- "add %x[outptr], %x[outptr], %x[row_stride], LSL #2\n"
- "add outptr01, outptr01, %x[row_stride], LSL #2\n"
- "add outptr10, outptr10, %x[row_stride], LSL #2\n"
- "add outptr11, outptr11, %x[row_stride], LSL #2\n"
-
- "subs tile_i, tile_i, #1\n"
- "bne 2b\n"
-
- "subs %[batch], %[batch], #1\n"
- "bne 1b\n"
- "5:"
-
- ".unreq inptr1\n"
- ".unreq inptr2\n"
- ".unreq inptr3\n"
- ".unreq inptr4\n"
- ".unreq inptr5\n"
- ".unreq inptr6\n"
- ".unreq inptr7\n"
- ".unreq inptr8\n"
- ".unreq outptr01\n"
- ".unreq outptr10\n"
- ".unreq outptr11\n"
- : [batch] "+r" (batch),
- [outptr] "+r" (outptr),
- [inptr] "+r" (inptr)
- : [tile_M] "r" (tile_M),
- [tile_N] "r" (tile_N),
- [n_channels] "r" (output_shape.n_channels),
- [row_stride] "r" (output_shape.n_cols * output_shape.n_channels)
- : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11",
- "x12", "x13", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
- "cc", "memory"
- );
-}
-/*****************************************************************************/
-
-/*****************************************************************************/
-template <>
-inline void winograd::Winograd2x2_3x3GemmOutput_TwoStage<float>::execute(
- const Tensor4DShape &output_shape,
- float* const matrices[16], float* const output
-) {
- // profiler prof;
-
- // Allocate memory for the intermediate matrices
- const int tile_M = iceildiv(output_shape.n_rows, 2);
- const int tile_N = iceildiv(output_shape.n_cols, 2);
- const int n_rows = output_shape.n_batches * tile_M * tile_N;
- const int n_channels = output_shape.n_channels;
- float* matrices_zf = reinterpret_cast<float*>(
- calloc(8 * n_rows * n_channels, sizeof(float))
- );
-
- // Perform the first stage transform, computing ZF.
- const auto f_compute_zf = [&] () {
- switch (n_channels % 4) {
- case 0:
- compute_zf<0>(n_rows, n_channels, matrices_zf, matrices);
- break;
- case 1:
- compute_zf<1>(n_rows, n_channels, matrices_zf, matrices);
- break;
- case 2:
- compute_zf<2>(n_rows, n_channels, matrices_zf, matrices);
- break;
- case 3:
- compute_zf<3>(n_rows, n_channels, matrices_zf, matrices);
- };
- };
- // prof("Compute ZF", f_compute_zf, 16 * n_rows * n_channels * sizeof(float), 0, 8 * n_rows * n_channels * sizeof(float));
- f_compute_zf();
-
- // Perform the second stage transform, finishing Z F Z^T - variable dispatch
- // based on size of the output and the channel tail.
- const auto f_compute_zfzT = [&] () {
- if (output_shape.n_rows % 2 && output_shape.n_cols % 2) {
- constexpr bool tail_M = true, tail_N = true;
- switch (n_channels % 4) {
- case 0:
- compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
- break;
- case 1:
- compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
- break;
- case 2:
- compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
- break;
- case 3:
- compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
- }
- } else if (output_shape.n_rows % 2) {
- constexpr bool tail_M = true, tail_N = false;
- switch (n_channels % 4) {
- case 0:
- compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
- break;
- case 1:
- compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
- break;
- case 2:
- compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
- break;
- case 3:
- compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
- }
- } else if (output_shape.n_cols % 2) {
- constexpr bool tail_M = false, tail_N = true;
- switch (n_channels % 4) {
- case 0:
- compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
- break;
- case 1:
- compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
- break;
- case 2:
- compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
- break;
- case 3:
- compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
- }
- } else {
- constexpr bool tail_M = false, tail_N = false;
- switch (n_channels % 4) {
- case 0:
- compute_zfzT<tail_M, tail_N, 0>(output_shape, output, matrices_zf);
- break;
- case 1:
- compute_zfzT<tail_M, tail_N, 1>(output_shape, output, matrices_zf);
- break;
- case 2:
- compute_zfzT<tail_M, tail_N, 2>(output_shape, output, matrices_zf);
- break;
- case 3:
- compute_zfzT<tail_M, tail_N, 3>(output_shape, output, matrices_zf);
- }
- }
- };
- // prof("Compute ZFZT", f_compute_zfzT, 8 * n_rows * n_channels * sizeof(float), 0, 4 * n_rows * n_channels * sizeof(float));
- f_compute_zfzT();
-
- free(reinterpret_cast<void*>(matrices_zf));
-}
-/*****************************************************************************/
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
new file mode 100644
index 0000000..e7907d1
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/output_2x2_3x3_fp32.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "transforms/output.hpp"
+#include "winograd_gemm.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
+
+template <>
+template <>
+int Transform::ops_performed(const Tensor4DShape &shape)
+{
+ // NOTE: Cost in FLOPs rather than instructions or uops.
+ const int tile_M = iceildiv(shape.n_rows, 2);
+ const int tile_N = iceildiv(shape.n_cols, 2);
+ return 24 * tile_M * tile_N * shape.n_channels;
+}
+
+/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use
+ * enough tiles to cover the output space, each output tile may contain 0 or 1
+ * padded values in its rightmost column and/or bottom row, e.g.:
+ *
+ * ___ ___
+ * | | | X|
+ * |___| |__X|
+ *
+ * ___ ___
+ * | | | X|
+ * |X_X| |X_X|
+ *
+ *
+ * We provide a specialised output transform for each of these instances.
+ * Consequently, below we construct an array, indexed by the amount of bottom
+ * and right padding, of pointers to the specialised implementations.
+ */
+template <>
+template <>
+template <int pad_bottom, int pad_right>
+void Transform::process_tile(
+ const int n_channels,
+ const float* const matrix_base,
+ const int matrix_stride,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ constexpr int cells_i = 2 - pad_bottom;
+ constexpr int cells_j = 2 - pad_right;
+
+ // Construct a map to the output cells
+ float *outptrs[cells_i][cells_j];
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+ const float *inptr = matrix_base;
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vaddq_f32(vaddq_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsubq_f32(vsubq_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsubq_f32(vsubq_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
+ }
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ // FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][0] = vadd_f32(vadd_f32(F[i][0], F[i][1]), F[i][2]);
+
+ // FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ FZ[i][1] = vsub_f32(vsub_f32(F[i][1], F[i][2]), F[i][3]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ // f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[0][j] = vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), FZ[2][j]);
+
+ // f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ f[1][j] = vsub_f32(vsub_f32(FZ[1][j], FZ[2][j]), FZ[3][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[4][4], FZ[4][2], f[2][2];
+
+ // Read a 4x4 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 4; i++)
+ {
+ for (int j = 0; j < 4; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 4; i++)
+ {
+ FZ[i][0] = F[i][0] + F[i][1] + F[i][2];
+ FZ[i][1] = F[i][1] - F[i][2] - F[i][3];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 2; j++)
+ {
+ f[0][j] = FZ[0][j] + FZ[1][j] + FZ[2][j];
+ f[1][j] = FZ[1][j] - FZ[2][j] - FZ[3][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
+ }
+ }
+}
+
+template <>
+template <>
+const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+{
+ {
+ Transform::template process_tile<0, 0>, // No padding
+ Transform::template process_tile<0, 1>, // Right padding
+ },
+ {
+ Transform::template process_tile<1, 0>, // Bottom padding
+ Transform::template process_tile<1, 1>, // Bottom and right padding
+ }
+};
+
+template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;
+} // namespace winograd
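The tile_fns table above maps the number of padded rows/columns in an output tile to a specialised kernel. The code that indexes the table is not part of this hunk, so the following is only a sketch of how it might be driven; run_tile, tile_i, tile_j and outptr are hypothetical names, the includes are assumed to provide Tensor4DShape and the transform class, and it is assumed that tile_fns is reachable from the call site (in the library it is consumed inside the output transform itself).

    // Sketch only: derive the padding for the 2x2 output tile at (tile_i, tile_j)
    // and invoke the matching specialisation from the table above.
    #include <algorithm>
    #include "transforms/output.hpp"
    #include "winograd_gemm.hpp"

    using Transform2x2 = winograd::WinogradGEMM<2, 2, 3, 3>::OutputTransform<float>;

    void run_tile(const Tensor4DShape &output_shape,
                  const int tile_i, const int tile_j, const int n_channels,
                  const float *matrix_base, const int matrix_stride,
                  float *outptr, const int output_row_stride, const int output_col_stride)
    {
      // The padding is however much of the 2x2 tile overhangs the output tensor (0 or 1).
      const int pad_bottom = std::max(0, (tile_i + 1) * 2 - output_shape.n_rows);
      const int pad_right  = std::max(0, (tile_j + 1) * 2 - output_shape.n_cols);

      // Each specialisation reads a 4x4 Winograd-domain tile and writes only the
      // unpadded cells of the corresponding 2x2 output tile.
      Transform2x2::tile_fns[pad_bottom][pad_right](
          n_channels, matrix_base, matrix_stride,
          outptr, output_row_stride, output_col_stride);
    }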
diff --git a/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp
new file mode 100644
index 0000000..483e5c1
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/output_4x4_3x3_fp32.cpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "transforms/output.hpp"
+#include "winograd_gemm.hpp"
+#include "arm.hpp"
+
+namespace winograd
+{
+
+using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
+
+template <>
+template <>
+int Transform::ops_performed(const Tensor4DShape &shape)
+{
+ // NOTE: Cost in FLOPs rather than instructions or uops.
+ const int tile_M = iceildiv(shape.n_rows, 4);
+ const int tile_N = iceildiv(shape.n_cols, 4);
+ return 170 * tile_M * tile_N * shape.n_channels;
+}
+
+// Instantiate cost methods
+template int Transform::ops_performed(const Tensor4DShape&);
+
+/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use
+ * enough tiles to cover the output space, each output tile may contain up to 3
+ * padded values in its rightmost columns and/or bottom rows, e.g.:
+*
+* ________ ________ ________ ________
+* | | | X| | X X| | X X X|
+* | | | X| | X X| | X X X|
+* | | | X| | X X| | X X X|
+* |_______| |______X| |____X_X| |__X_X_X|
+*
+* ________ ________ ________ ________
+* | | | X| | X X| | X X X|
+* | | | X| | X X| | X X X|
+* | | | X| | X X| | X X X|
+* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
+*
+* ________ ________ ________ ________
+* | | | X| | X X| | X X X|
+* | | | X| | X X| | X X X|
+* |X X X X| |X X X X| |X X X X| |X X X X|
+* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
+*
+* ________ ________ ________ ________
+* | | | X| | X X| | X X X|
+* |X X X X| |X X X X| |X X X X| |X X X X|
+* |X X X X| |X X X X| |X X X X| |X X X X|
+* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X|
+*
+*
+* We provide a specialised output transform for each of these instances.
+*/
+template <>
+template <>
+template <int pad_bottom, int pad_right>
+void Transform::process_tile(
+ const int n_channels,
+ const float* const matrix_base,
+ const int matrix_stride,
+ float* const output,
+ const int output_row_stride,
+ const int output_col_stride
+)
+{
+ constexpr int cells_i = 4 - pad_bottom;
+ constexpr int cells_j = 4 - pad_right;
+
+ // Construct a map to the output cells
+ float *outptrs[cells_i][cells_j];
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ outptrs[i][j] = output + i*output_row_stride + j*output_col_stride;
+ }
+ }
+ const float *inptr = matrix_base;
+
+ // For each channel of the output
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed during this transform
+ float32x4_t F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1q_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 4;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vaddq_f32(vaddq_f32(vaddq_f32(F[i][0], F[i][1]), vaddq_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmlaq_n_f32(vaddq_f32(F[i][1], F[i][2]), vaddq_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vaddq_f32(vmlaq_n_f32(vsubq_f32(F[i][1], F[i][2]), vsubq_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vaddq_f32(vaddq_f32(vaddq_f32(FZ[0][j], FZ[1][j]), vaddq_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmlaq_n_f32(vaddq_f32(FZ[1][j], FZ[2][j]), vaddq_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vaddq_f32(vmlaq_n_f32(vsubq_f32(FZ[1][j], FZ[2][j]), vsubq_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1q_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 4;
+ }
+ }
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed during this transform
+ float32x2_t F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = vld1_f32(inptr + m*matrix_stride);
+ }
+ }
+ inptr += 2;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ // FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][0] = vadd_f32(vadd_f32(vadd_f32(F[i][0], F[i][1]), vadd_f32(F[i][2], F[i][3])), F[i][4]);
+
+ // FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][1] = vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 2.0f);
+
+ // FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][2] = vmla_n_f32(vadd_f32(F[i][1], F[i][2]), vadd_f32(F[i][3], F[i][4]), 4.0f);
+
+ // FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ FZ[i][3] = vadd_f32(vmla_n_f32(vsub_f32(F[i][1], F[i][2]), vsub_f32(F[i][3], F[i][4]), 8.0f), F[i][5]);
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ // f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[0][j] = vadd_f32(vadd_f32(vadd_f32(FZ[0][j], FZ[1][j]), vadd_f32(FZ[2][j], FZ[3][j])), FZ[4][j]);
+
+ // f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[1][j] = vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 2.0f);
+
+ // f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[2][j] = vmla_n_f32(vadd_f32(FZ[1][j], FZ[2][j]), vadd_f32(FZ[3][j], FZ[4][j]), 4.0f);
+
+ // f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ f[3][j] = vadd_f32(vmla_n_f32(vsub_f32(FZ[1][j], FZ[2][j]), vsub_f32(FZ[3][j], FZ[4][j]), 8.0f), FZ[5][j]);
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ vst1_f32(outptrs[i][j], f[i][j]);
+ outptrs[i][j] += 2;
+ }
+ }
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed during this transform
+ float F[6][6], FZ[6][4], f[4][4];
+
+ // Read a 6x6 tile in the Winograd domain
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ F[i][j] = *(inptr + m*matrix_stride);
+ }
+ }
+ inptr++;
+
+ // Compute the matrix F Z
+ for (int i = 0; i < 6; i++)
+ {
+ FZ[i][0] = 1*F[i][0] + 1*F[i][1] + 1*F[i][2] + 1*F[i][3] + 1*F[i][4];
+ FZ[i][1] = 1*F[i][1] + -1*F[i][2] + 2*F[i][3] + -2*F[i][4];
+ FZ[i][2] = 1*F[i][1] + 1*F[i][2] + 4*F[i][3] + 4*F[i][4];
+ FZ[i][3] = 1*F[i][1] + -1*F[i][2] + 8*F[i][3] + -8*F[i][4] + 1*F[i][5];
+ }
+
+ // Compute the output tile f = ZT F Z
+ for (int j = 0; j < 4; j++)
+ {
+ f[0][j] = 1*FZ[0][j] + 1*FZ[1][j] + 1*FZ[2][j] + 1*FZ[3][j] + 1*FZ[4][j];
+ f[1][j] = 1*FZ[1][j] + -1*FZ[2][j] + 2*FZ[3][j] + -2*FZ[4][j];
+ f[2][j] = 1*FZ[1][j] + 1*FZ[2][j] + 4*FZ[3][j] + 4*FZ[4][j];
+ f[3][j] = 1*FZ[1][j] + -1*FZ[2][j] + 8*FZ[3][j] + -8*FZ[4][j] + 1*FZ[5][j];
+ }
+
+ // Write out the output tile
+ for (int i = 0; i < cells_i; i++)
+ {
+ for (int j = 0; j < cells_j; j++)
+ {
+ *(outptrs[i][j]++) = f[i][j];
+ }
+ }
+ }
+}
+
+template <>
+template <>
+const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] =
+{
+ {
+ Transform::template process_tile<0, 0>,
+ Transform::template process_tile<0, 1>,
+ Transform::template process_tile<0, 2>,
+ Transform::template process_tile<0, 3>,
+ },
+ {
+ Transform::template process_tile<1, 0>,
+ Transform::template process_tile<1, 1>,
+ Transform::template process_tile<1, 2>,
+ Transform::template process_tile<1, 3>,
+ },
+ {
+ Transform::template process_tile<2, 0>,
+ Transform::template process_tile<2, 1>,
+ Transform::template process_tile<2, 2>,
+ Transform::template process_tile<2, 3>,
+ },
+ {
+ Transform::template process_tile<3, 0>,
+ Transform::template process_tile<3, 1>,
+ Transform::template process_tile<3, 2>,
+ Transform::template process_tile<3, 3>,
+ }
+};
+
+template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform<float>;
+} // namespace winograd
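For reference, the scalar tail of process_tile above can be read as computing f = A^T F A, where F is the 6x6 Winograd-domain tile, f is the 4x4 output tile, and the transform matrix (reading the coefficients straight off the code; it is not written out anywhere in the patch) is

          [ 1   1   1   1   1   0 ]
    A^T = [ 0   1  -1   2  -2   0 ]
          [ 0   1   1   4   4   0 ]
          [ 0   1  -1   8  -8   1 ]

The FZ[i][j] intermediates are the entries of F A, and the NEON paths above carry out the same arithmetic four (float32x4_t) or two (float32x2_t) channels at a time.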
diff --git a/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp
new file mode 100644
index 0000000..c0b2824
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/weights_2x2_3x3_fp32.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "winograd_gemm.hpp"
+#include "transforms/kernel.hpp"
+
+namespace winograd
+{
+ template <>
+ template <>
+ void WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input,
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ constexpr int inner_tile_i = 4;
+ constexpr int inner_tile_j = 4;
+
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const auto weight_row_stride = 3 * weight_col_stride;
+ const float *inptrs[3][3];
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+ }
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptrs[i][j]);
+ inptrs[i][j] += 4;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmulq_n_f32(vaddq_f32(vsubq_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmulq_n_f32(vaddq_f32(vsubq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptrs[i][j]);
+ inptrs[i][j] += 2;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+
+ // Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ // Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[2][j] = vmul_n_f32(vadd_f32(vsub_f32(w[0][j], w[1][j]), w[2][j]), 0.5f);
+
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+
+ // V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][1] = vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ // V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][2] = vmul_n_f32(vadd_f32(vsub_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), 0.5f);
+
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[inner_tile_i][3], V[inner_tile_i][inner_tile_j];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptrs[i][j]++);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = w[0][j];
+ Ww[1][j] = 0.5*(w[0][j] + w[1][j] + w[2][j]);
+ Ww[2][j] = 0.5*(w[0][j] - w[1][j] + w[2][j]);
+ Ww[3][j] = w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < inner_tile_i; i++)
+ {
+ V[i][0] = Ww[i][0];
+ V[i][1] = 0.5*(Ww[i][0] + Ww[i][1] + Ww[i][2]);
+ V[i][2] = 0.5*(Ww[i][0] - Ww[i][1] + Ww[i][2]);
+ V[i][3] = Ww[i][2];
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < inner_tile_i; i++)
+ {
+ for (int j = 0; j < inner_tile_j; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+ outptr++;
+ }
+ }
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ const int channel_prod = shape.n_input_channels * shape.n_output_channels;
+ return 2 * 18 * channel_prod;
+ }
+
+ template struct WinogradGEMM<2, 2, 3, 3>::WeightsTransform<float>;
+} // namespace winograd
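As with the output transform, the per-channel arithmetic above amounts to V = G w G^T for each (input channel, output channel) pair, where w is the 3x3 kernel. Reading the coefficients from the scalar tail (an interpretation, not something stated in the patch):

        [  1    0    0  ]
    G = [ 1/2  1/2  1/2 ]
        [ 1/2 -1/2  1/2 ]
        [  0    0    1  ]

Ww holds the intermediate G w, and the resulting 4x4 matrix V is scattered across the sixteen Winograd matrices via matrix_stride.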
diff --git a/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp
new file mode 100644
index 0000000..de659c3
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/transforms/weights_4x4_3x3_fp32.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm.hpp"
+#include "winograd_gemm.hpp"
+#include "transforms/kernel.hpp"
+
+namespace winograd
+{
+ /* Float implementation for kernel transform F(4x4, 3x3) */
+ template <>
+ template <>
+ void WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::execute(
+ const int n_output_channels,
+ const int n_input_channels,
+ const float* const input, // NOTE: Data in HWIO order
+ float* const output,
+ const int matrix_stride,
+ const int matrix_row_stride
+ )
+ {
+ // Get pointers to each cell of the weight tensor
+ const auto weight_col_stride = n_input_channels * n_output_channels;
+ const auto weight_row_stride = 3 * weight_col_stride;
+ const float *inptrs[3][3];
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ inptrs[i][j] = input + i*weight_row_stride + j*weight_col_stride;
+ }
+ }
+
+ // For each input channel
+ for (int ic = 0; ic < n_input_channels; ic++)
+ {
+ float *outptr = output + ic * matrix_row_stride;
+
+ // For each output channel
+ int channels_remaining = n_output_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Matrices used and computed in this kernel
+ float32x4_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1q_f32(inptrs[i][j]);
+ inptrs[i][j] += 4;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmulq_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmulq_n_f32(vaddq_f32(vaddq_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmulq_n_f32(vsubq_f32(vsubq_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmlaq_n_f32(vmlaq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmlaq_n_f32(vmlsq_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmulq_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmulq_n_f32(vmulq_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmulq_n_f32(vmulq_n_f32(vaddq_f32(vaddq_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmulq_n_f32(vmulq_n_f32(vsubq_f32(vsubq_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmulq_n_f32(vmlaq_n_f32(vmlaq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmulq_n_f32(vmlaq_n_f32(vmlsq_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmulq_n_f32(vmulq_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1q_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 4;
+ }
+#endif // __aarch64__
+#ifdef __arm_any__
+ for (; channels_remaining >= 2; channels_remaining -= 2)
+ {
+ // Matrices used and computed in this kernel
+ float32x2_t w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = vld1_f32(inptrs[i][j]);
+ inptrs[i][j] += 2;
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ // Ww[0][j] = 6*w[0][j];
+ Ww[0][j] = vmul_n_f32(w[0][j], 6.0);
+
+ // Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[1][j] = vmul_n_f32(vadd_f32(vadd_f32(w[0][j], w[1][j]), w[2][j]), -4.0);
+
+ // Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = vmul_n_f32(vsub_f32(vsub_f32(w[1][j], w[0][j]), w[2][j]), 4.0);
+
+ // Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[3][j] = vmla_n_f32(vmla_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = vmla_n_f32(vmls_n_f32(w[0][j], w[1][j], 2.0f), w[2][j], 4.0f);
+
+ // Ww[5][j] = 24*w[2][j];
+ Ww[5][j] = vmul_n_f32(w[2][j], 24.0f);
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ const float recip576 = 1.0f / 576.0f;
+
+ // V[i][0] = 6*Ww[i][0];
+ V[i][0] = vmul_n_f32(vmul_n_f32(Ww[i][0], 6.0), recip576);
+
+ // V[i][1] = -4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2];
+ V[i][1] = vmul_n_f32(vmul_n_f32(vadd_f32(vadd_f32(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
+
+ // V[i][2] = -4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2];
+ V[i][2] = vmul_n_f32(vmul_n_f32(vsub_f32(vsub_f32(Ww[i][1], Ww[i][0]), Ww[i][2]), 4.0), recip576);
+
+ // V[i][3] = 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2];
+ V[i][3] = vmul_n_f32(vmla_n_f32(vmla_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][4] = 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2];
+ V[i][4] = vmul_n_f32(vmla_n_f32(vmls_n_f32(Ww[i][0], Ww[i][1], 2.0f), Ww[i][2], 4.0f), recip576);
+
+ // V[i][5] = 24*Ww[i][2];
+ V[i][5] = vmul_n_f32(vmul_n_f32(Ww[i][2], 24.0f), recip576);
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ vst1_f32(outptr + m*matrix_stride, V[i][j]);
+ }
+ }
+ outptr += 2;
+ }
+#endif // __arm_any__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Matrices used and computed in this kernel
+ float w[3][3], Ww[6][3], V[6][6];
+
+ // Read weights
+ for (int i = 0; i < 3; i++)
+ {
+ for (int j = 0; j < 3; j++)
+ {
+ w[i][j] = *(inptrs[i][j]++);
+ }
+ }
+
+ // Compute the matrix W w
+ for (int j = 0; j < 3; j++)
+ {
+ Ww[0][j] = 6*w[0][j];
+ Ww[1][j] = -4*w[0][j] + -4*w[1][j] + -4*w[2][j];
+ Ww[2][j] = -4*w[0][j] + 4*w[1][j] + -4*w[2][j];
+ Ww[3][j] = 1*w[0][j] + 2*w[1][j] + 4*w[2][j];
+ Ww[4][j] = 1*w[0][j] + -2*w[1][j] + 4*w[2][j];
+ Ww[5][j] = 24*w[2][j];
+ }
+
+ // Compute V = W w WT
+ for (int i = 0; i < 6; i++)
+ {
+ V[i][0] = ( 6*Ww[i][0]) / 576.0;
+ V[i][1] = (-4*Ww[i][0] + -4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][2] = (-4*Ww[i][0] + 4*Ww[i][1] + -4*Ww[i][2]) / 576.0;
+ V[i][3] = ( 1*Ww[i][0] + 2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][4] = ( 1*Ww[i][0] + -2*Ww[i][1] + 4*Ww[i][2]) / 576.0;
+ V[i][5] = (24*Ww[i][2]) / 576.0;
+ }
+
+ // Store the transformed weights
+ for (int i = 0, m = 0; i < 6; i++)
+ {
+ for (int j = 0; j < 6; j++, m++)
+ {
+ *(outptr + m*matrix_stride) = V[i][j];
+ }
+ }
+ outptr++;
+ }
+ }
+ }
+
+ template <>
+ template <>
+ int WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>::ops_performed(const KernelShape &shape)
+ {
+ const int channel_prod = shape.n_input_channels * shape.n_output_channels;
+ return 9 * 16 * channel_prod;
+ }
+
+ template struct WinogradGEMM<4, 4, 3, 3>::WeightsTransform<float>;
+} // namespace winograd
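The F(4x4, 3x3) kernel transform has the same V = G w G^T structure, but the rows of the transform are scaled to small integer vectors and the trailing multiplication by 1/576 (= 1/(24*24)) undoes that scaling once, after the matrix has been applied on both sides. Reading the coefficients off the scalar tail (again an interpretation, not spelled out in the patch):

             [  6   0   0 ]
             [ -4  -4  -4 ]
    24 G  =  [ -4   4  -4 ]     so     V = ((24 G) w (24 G)^T) / 576  =  G w G^T
             [  1   2   4 ]
             [  1  -2   4 ]
             [  0   0  24 ]

which corresponds to the usual G = [1/4, 0, 0; -1/6, -1/6, -1/6; -1/6, 1/6, -1/6; 1/24, 1/12, 1/6; 1/24, -1/12, 1/6; 0, 0, 1], presumably kept in this scaled-integer form for the intermediate arithmetic.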
diff --git a/src/core/NEON/kernels/winograd/utils.hpp b/src/core/NEON/kernels/winograd/utils.cpp
similarity index 76%
rename from src/core/NEON/kernels/winograd/utils.hpp
rename to src/core/NEON/kernels/winograd/utils.cpp
index 14e709f..24d0386 100644
--- a/src/core/NEON/kernels/winograd/utils.hpp
+++ b/src/core/NEON/kernels/winograd/utils.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright (c) 2017 ARM Limited.
*
@@ -22,31 +21,27 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#pragma once
+
+#include <cstdio>
#include <ctime>
-inline double TimeInUs(void) {
+double TimeInUs(void)
+{
#ifdef CYCLE_PROFILING
timespec t;
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
+ clock_gettime(CLOCK_REALTIME, &t);
return 1e6*t.tv_sec + 1e-3*t.tv_nsec;
#else
return 0;
#endif
}
-inline int iceildiv(const int a, const int b) {
- return (a + b - 1) / b;
-}
-
-template <typename T>
-inline T roundup(const T a, const T b) {
- return a + b - (a % b);
-}
-
-inline void PrintMatrix(const float* const m, const int M, const int N, const int row_stride) {
- for (int i = 0; i < M; i++) {
- for (int j = 0; j < N; j++) {
+void PrintMatrix(const float* const m, const int M, const int N, const int row_stride)
+{
+ for (int i = 0; i < M; i++)
+ {
+ for (int j = 0; j < N; j++)
+ {
printf("%.3f ", m[i*row_stride + j]);
}
printf("\n");
diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/winograd/winograd_gemm.cpp
new file mode 100644
index 0000000..b44a453
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/winograd_gemm.cpp
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "winograd_gemm.hpp"
+#include "batched_blocked_gemm.hpp"
+using namespace winograd;
+
+/** Get the output shape of a convolution. */
+template <int kr, int kc, int itr, int itc>
+template <typename TOut, typename TIn>
+Tensor4DShape WinogradGEMM<kr, kc, itr, itc>::Convolution<TOut, TIn>::get_output_shape(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &in_shape,
+ const PaddingType padding
+)
+{
+ // TODO Accept different kernel sizes
+ return Tensor4DShape {
+ in_shape.n_batches,
+ (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2,
+ (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2,
+ kernel_shape.n_output_channels,
+ in_shape.ordering
+ };
+}
+
+/* Get the memory required to transform the kernel.
+ */
+template <int kernel_rows, int kernel_cols,
+ int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_transform_working_size(const KernelShape &shape)
+{
+ if (shape.ordering == HWIO)
+ {
+ // Kernel is already in the correct order, so no additional memory is
+ // required.
+ return 0;
+ }
+ else
+ {
+ // Need to re-order the kernel into HWIO form, require enough space to
+ // represent the tensor.
+ return sizeof(TIn) * shape.size();
+ }
+}
+
+/** Get the memory required to store the kernel transformed into the
+ * Winograd domain.
+ */
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_storage_size(const KernelShape &shape)
+{
+ return N_GEMMS * get_kernel_matrix_size(shape);
+}
+
+
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_storage_size(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding
+)
+{
+ return N_GEMMS * get_input_matrix_size(kernel_shape, input_shape, padding);
+}
+
+
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_storage_size(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding
+)
+{
+ return N_GEMMS * get_output_matrix_size(kernel_shape, input_shape, padding);
+}
+
+
+/** Get the memory required to apply a Winograd operator to some input.
+ */
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_working_space_size(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding_type
+)
+{
+ const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+
+ // Get the memory required to store the matrices
+ const size_t matrix_sizes = N_GEMMS * (
+ get_input_matrix_size(kernel_shape, input_shape, padding_type) +
+ get_output_matrix_size(kernel_shape, input_shape, padding_type)
+ );
+
+ // Add additional space to re-order the input and output if the input tensor
+ // is not in NHWC format.
+ if (input_shape.ordering == NHWC)
+ {
+ return matrix_sizes; // No extra spacing required
+ }
+ else // NCHW, must reorder the input and output tensors
+ {
+ // We only need to re-order the input or output at any one time, so request
+ // enough memory to do the largest of these.
+ const size_t extra_memory = std::max(
+ sizeof(TIn) * input_shape.size(),
+ sizeof(TOut) * output_shape.size()
+ );
+ return matrix_sizes + extra_memory;
+ }
+}
+
+
+/* Get the memory required by a single "input" matrix.
+ */
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_size(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding_type
+)
+{
+ return get_input_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TIn);
+}
+
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_input_matrix_stride(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding_type
+)
+{
+ // Compute shape for the GEMM
+ const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+ const int M = roundup(input_shape.n_batches * tile_rows * tile_cols, M_BLOCK);
+ const int K = kernel_shape.n_input_channels;
+
+ return M * K;
+}
+
+
+/* Get the memory required by a single "output" matrix.
+ */
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_size(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding_type
+)
+{
+ return get_output_matrix_stride(kernel_shape, input_shape, padding_type) * sizeof(TOut);
+}
+
+
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_output_matrix_stride(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding_type
+)
+{
+ // Compute shape for the GEMM
+ const auto output_shape = get_output_shape(kernel_shape, input_shape, padding_type);
+ const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
+ const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+ const int M = roundup(tile_rows * tile_cols, M_BLOCK);
+ const int N = roundup(kernel_shape.n_output_channels, N_BLOCK);
+
+ return input_shape.n_batches * M * N;
+}
+
+
+/* Get the memory required by a single "kernel" matrix.
+ */
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+size_t WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_size(const KernelShape &shape)
+{
+ return sizeof(TIn) * get_kernel_matrix_stride(shape);
+}
+
+template <int kernel_rows, int kernel_cols, int output_tile_rows, int output_tile_cols>
+template <typename TOut, typename TIn>
+int WinogradGEMM<kernel_rows, kernel_cols, output_tile_rows, output_tile_cols>::Convolution<TOut, TIn>::get_kernel_matrix_stride(const KernelShape &shape)
+{
+ const int K = shape.n_input_channels;
+ const int N = roundup(shape.n_output_channels, N_BLOCK);
+ return K * N;
+}
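As a concrete illustration of the three stride computations above: take the WinogradGEMM<4, 4, 3, 3> instantiation with a 1x28x28x32 NHWC input, 24 output channels and SAME padding, using the 4x4 output tile as the tile divisor (as winograd_layer.cpp later in this patch computes it), and assuming M_BLOCK = 4 and N_BLOCK = 16, the block sizes used by the implementation this patch removes. Shapes and block sizes are illustrative only.

// tile_rows = iceildiv(28, 4) = 7, tile_cols = 7  ->  49 tiles per batch
// get_input_matrix_stride  : roundup(1*7*7, 4) * 32               = 52 * 32 = 1664 elements
// get_output_matrix_stride : 1 * roundup(7*7, 4) * roundup(24, 16) = 52 * 32 = 1664 elements
// get_kernel_matrix_stride : 32 * roundup(24, 16)                  = 32 * 32 = 1024 elements
// The corresponding *_size values are these strides times sizeof(float), once
// for each of the N_GEMMS Winograd-domain GEMMs.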
+
+
+/** Create a new Winograd operator. */
+template <int output_tile_rows, int output_tile_cols,
+ int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding,
+ void *kernel_storage
+) : kernel_shape(kernel_shape), // Store the kernel shape
+ kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)),
+ manage_kernel_storage(kernel_storage == NULL),
+ _kernel_storage(manage_kernel_storage ?
+ ALLOCATE(get_kernel_storage_size(kernel_shape)) :
+ kernel_storage),
+ input_shape(input_shape),
+ padding(padding),
+ output_shape(get_output_shape(kernel_shape, input_shape, padding)),
+ tile_rows(iceildiv(output_shape.n_rows, output_tile_rows)),
+ tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
+ M(input_shape.n_batches * tile_rows * tile_cols),
+ K(kernel_shape.n_input_channels),
+ N(kernel_shape.n_output_channels),
+ prof()
+{
+ // Create pointers to the kernel matrices
+ const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
+ int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
+ for (int i = 0; i < N_GEMMS; i++) {
+ kernel_matrices[i] = reinterpret_cast<TIn *>(
+ ks_bytes + i*kernel_matrix_size_bytes);
+ }
+}
+
+
+/** Create a new Winograd operator and initialise the weights. */
+template <int output_tile_rows, int output_tile_cols,
+ int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Convolution<TOut, TIn>::Convolution(
+ const KernelShape &kernel_shape,
+ const Tensor4DShape &input_shape,
+ const PaddingType padding,
+ const TIn* const kernel,
+ void *kernel_storage,
+ void *transform_working_space
+) : Convolution(kernel_shape, input_shape, padding, kernel_storage)
+{
+ transform_weights(kernel, transform_working_space);
+}
+
+
+/** Clean up a convolution engine. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::~Convolution()
+{
+ // If we were responsible for managing kernel storage ensure that it is
+ // freed.
+ if (manage_kernel_storage)
+ {
+ free(_kernel_storage);
+ }
+}
+
+
+/** Transform weights into the Winograd domain and store them for later use/reuse. */
+template <int output_tile_rows, int output_tile_cols, int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+template <typename WeightsTransformT>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::transform_weights(
+ const TIn* const kernel,
+ void *transform_working_space
+)
+{
+ // Allocate working space if it is required
+ bool allocated_working_space = false;
+ if (transform_working_space == NULL && // If no memory has been provided
+ get_kernel_transform_working_size(kernel_shape) != 0) // And we need the space
+ {
+ allocated_working_space = true;
+ transform_working_space = ALLOCATE(
+ get_kernel_transform_working_size(kernel_shape)
+ );
+ }
+
+  // The transformation methods only work on weights laid out in HWIO form; if
+ // the weights are not in this form then we need to re-order them.
+ const TIn *kernel_hwio = kernel;
+ if (kernel_shape.ordering != HWIO)
+ {
+ kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
+
+ // Re-order the weights from OIHW to HWIO
+ this->prof(
+ "Weight reorder",
+ [&kernel, &kernel_hwio, this] () {
+ reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
+ kernel, const_cast<TIn *>(kernel_hwio),
+ kernel_shape.n_output_channels,
+ kernel_shape.n_input_channels,
+ kernel_shape.n_rows,
+ kernel_shape.n_cols
+ );
+ },
+ kernel_shape.size() * sizeof(TIn),
+ 0,
+ kernel_shape.size() * sizeof(TIn)
+ );
+ }
+
+ const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
+ WeightsTransformT weights_transform(
+ kernel_hwio, kernel_matrices[0],
+ kernel_matrix_size_bytes / sizeof(TIn),
+ kernel_matrix_row_stride,
+ kernel_shape.n_output_channels,
+ kernel_shape.n_input_channels
+ );
+
+ // Transform the weights into the Winograd domain
+ auto kernel_prep = [&] ()
+ {
+ weights_transform.run(0, weights_transform.get_window());
+ };
+
+ prof(
+ "Kernel Prep", kernel_prep,
+ WeightsTransformT::bytes_read(kernel_shape),
+ WeightsTransformT::ops_performed(kernel_shape),
+ WeightsTransformT::bytes_written(kernel_shape)
+ );
+
+ // Free memory if we allocated it
+ if (allocated_working_space)
+ {
+ free(transform_working_space);
+ }
+}
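The OIHW -> HWIO re-ordering called above is only referenced here, not defined. A minimal reference version, assuming reorder::ofm_ifm_h_w_to_h_w_ifm_ofm performs the element mapping its name implies (the library's own routine may be blocked or vectorised differently), is:

template <typename T>
void oihw_to_hwio_reference(const T *src, T *dst,
                            const int n_ofms, const int n_ifms,
                            const int n_rows, const int n_cols)
{
  // dst[h][w][ifm][ofm] = src[ofm][ifm][h][w]
  for (int o = 0; o < n_ofms; o++)
    for (int i = 0; i < n_ifms; i++)
      for (int r = 0; r < n_rows; r++)
        for (int c = 0; c < n_cols; c++)
          dst[((r*n_cols + c)*n_ifms + i)*n_ofms + o] =
            src[((o*n_ifms + i)*n_rows + r)*n_cols + c];
}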
+
+
+/** Perform a convolution. */
+template <int output_tile_rows, int output_tile_cols,
+ int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::execute(
+ TOut* const output,
+ const TIn* const input,
+ void *working_space,
+ const int n_threads
+)
+{
+ const auto padding_type = padding;
+ const auto input_shape = this->input_shape;
+
+ // Allocate working space if none has been provided
+ const bool manage_working_space = (working_space == NULL);
+ if (manage_working_space)
+ {
+ const size_t ws_size = get_working_space_size(
+ kernel_shape, input_shape, padding_type
+ );
+ working_space = ALLOCATE(ws_size * sizeof(int8_t));
+ memset(working_space, 0x00, ws_size);
+ }
+ int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
+
+  // Split the working space into the regions required for the N_GEMMS input
+  // matrices and the N_GEMMS output matrices.
+ TIn *input_matrices[N_GEMMS];
+ TOut *output_matrices[N_GEMMS];
+ const int in_matrix_stride_bytes = get_input_matrix_size(kernel_shape, input_shape, padding_type);
+ const int out_matrix_stride_bytes = get_output_matrix_size(kernel_shape, input_shape, padding_type);
+
+ for (int i = 0; i < N_GEMMS; i++)
+ {
+ input_matrices[i] = reinterpret_cast<TIn *>(
+ ws_bytes + i*in_matrix_stride_bytes);
+    output_matrices[i] = reinterpret_cast<TOut *>(
+ ws_bytes + N_GEMMS*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
+ }
+
+ // If we need to re-order the input and output tensors then the final chunk
+ // of the working space can be used for this purpose.
+ // TODO - Overlay the input reorder on top of the output matrices
+ // - Overlay the output reorder on top of the input matrices
+  // Reorder the input into NHWC form if it was not provided in this ordering.
+ const TIn* input_nhwc = input;
+ if (input_shape.ordering == NCHW)
+ {
+ input_nhwc = reinterpret_cast<TIn *>(
+ ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
+ );
+
+ this->prof(
+ "NCHW -> NHWC",
+ [input, input_shape, input_nhwc] () {
+ reorder::nchw_to_nhwc(
+ input, const_cast<TIn *>(input_nhwc),
+ input_shape.n_batches,
+ input_shape.n_channels,
+ input_shape.n_rows,
+ input_shape.n_cols
+ );
+ },
+ input_shape.size(), 0, input_shape.size()
+ );
+ }
+
+ // Compute shape for the GEMM
+ const auto output_shape = this->output_shape;
+ int M = this->M;
+ int K = this->K;
+ int N = this->N;
+
+ const int in_matrix_row_stride = K;
+ const int out_matrix_row_stride = kernel_matrix_row_stride;
+
+ InputTransform<TIn> input_transform(
+ input_nhwc,
+ input_shape.n_batches,
+ input_shape.n_rows,
+ input_shape.n_cols,
+ input_shape.n_channels,
+ padding_type,
+ input_matrices[0],
+ in_matrix_stride_bytes / sizeof(TIn),
+ in_matrix_row_stride
+ );
+
+ // Transform the input into the Winograd domain
+ auto input_prep = [&] () {
+ input_transform.run(0, input_transform.get_window());
+ };
+ prof(
+ "Input Prep", input_prep,
+ InputTransform<TIn>::bytes_read(input_shape),
+ InputTransform<TIn>::ops_performed(input_shape),
+ InputTransform<TIn>::bytes_written(input_shape)
+ );
+
+ // Perform the GEMMs
+ const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
+ BatchedBlockedGemm<M_BLOCK, N_BLOCK, TOut, TIn> gemms(
+ N_GEMMS, M, K, N,
+ in_matrix_stride_bytes / sizeof(TIn),
+ in_matrix_row_stride,
+ kernel_matrix_stride_bytes / sizeof(TIn),
+ kernel_matrix_row_stride,
+ out_matrix_stride_bytes / sizeof(TOut),
+ out_matrix_row_stride,
+ input_matrices[0],
+ kernel_matrices[0],
+ output_matrices[0]
+ );
+ gemms.run(0, gemms.get_window());
+
+ // If the output tensor needs to be in NCHW form then store the NHWC output
+ // tensor in temporary storage and then reorder. If the output tensor needs
+ // to be in NHWC then just write straight to the output tensor.
+ TOut *output_nhwc = output;
+ if (input_shape.ordering == NCHW)
+ {
+ output_nhwc = reinterpret_cast<TOut *>(
+ ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
+ );
+ }
+
+ // Transform the output tensor from the Winograd domain to the spatial
+ // domain.
+ OutputTransform<TOut> output_transform(
+ output_matrices[0],
+ out_matrix_stride_bytes / sizeof(TOut),
+ out_matrix_row_stride,
+ output_nhwc,
+ output_shape.n_batches,
+ output_shape.n_rows,
+ output_shape.n_cols,
+ output_shape.n_channels
+ );
+ auto output_prep = [&] () {
+ output_transform.run(0, output_transform.get_window());
+ };
+ prof(
+ "Output Comp", output_prep,
+ OutputTransform<TOut>::bytes_read(output_shape),
+ OutputTransform<TOut>::ops_performed(output_shape),
+ OutputTransform<TOut>::bytes_written(output_shape)
+ );
+
+ // Reorder the output tensor if it is required to be in NCHW form.
+ if (input_shape.ordering == NCHW)
+ {
+ prof(
+ "NHWC -> NCHW",
+ [output_nhwc, output_shape, output] () {
+ reorder::nhwc_to_nchw(
+ output_nhwc, output,
+ output_shape.n_batches,
+ output_shape.n_rows,
+ output_shape.n_cols,
+ output_shape.n_channels
+ );
+ },
+ output_shape.size(), 0, output_shape.size()
+ );
+ }
+
+ // Free working space if we were responsible for allocating it
+ if (manage_working_space)
+ {
+ free(working_space);
+ }
+}
+
+
+/** Perform a convolution. */
+template <int output_tile_rows, int output_tile_cols,
+ int kernel_rows, int kernel_cols>
+template <typename TOut, typename TIn>
+void WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::
+Convolution<TOut, TIn>::execute(
+ TOut* const output,
+ const TIn* const input,
+ const int n_threads
+)
+{
+ execute(output, input, NULL, n_threads);
+}
+
+
+// Instantiate required implementations
+template class WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>;
+template class WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;
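A hypothetical end-to-end use of the Convolution interface defined in this file; it assumes the size queries are static (as they were in the header this patch removes), that the weights are supplied in HWIO order and the tensors in NHWC, and that plain malloc is an acceptable stand-in for the library's ALLOCATE macro. Shapes and names are illustrative only.

#include <cstdlib>

#include "winograd_gemm.hpp"

void run_f4x4_3x3_example(const float *weights_hwio, const float *input_nhwc,
                          float *output_nhwc)
{
  using Conv = winograd::WinogradGEMM<4, 4, 3, 3>::Convolution<float, float>;

  const KernelShape   kernel_shape(24, 3, 3, 32);  // OFMs, rows, cols, IFMs
  const Tensor4DShape input_shape(1, 28, 28, 32);  // batches, rows, cols, channels

  // This constructor transforms the weights into the Winograd domain up front;
  // passing NULL lets it manage the kernel storage and transform scratch itself.
  Conv conv(kernel_shape, input_shape, PADDING_SAME, weights_hwio, NULL, NULL);

  // Working space for the transformed input and output matrices (size in bytes).
  void *ws = malloc(Conv::get_working_space_size(kernel_shape, input_shape, PADDING_SAME));
  conv.execute(output_nhwc, input_nhwc, ws, 1 /* n_threads */);
  free(ws);
}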
diff --git a/src/core/NEON/kernels/winograd/winograd_gemm.hpp b/src/core/NEON/kernels/winograd/winograd_gemm.hpp
deleted file mode 100644
index 1ca3e31..0000000
--- a/src/core/NEON/kernels/winograd/winograd_gemm.hpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <cstdint>
-#include <cstdlib>
-#include <cassert>
-
-#include "gemm.hpp"
-#include "profiler.hpp"
-#include "utils.hpp"
-#include "shims.hpp"
-
-#include "transforms.hpp"
-
-namespace winograd {
- /***************************************************************************/
- /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM
- * internally.
- */
- template <typename TOut, typename TIn>
- class Winograd2x2_3x3GEMM {
- public:
- /* Instantiate a new Winograd operator.
- */
- Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);
- virtual ~Winograd2x2_3x3GEMM();
-
- /** Transform the weights into the Winograd domain.
- */
- template <typename KernelTransform=winograd2x2_3x3_gemm_kernel_transform_impl<TIn>>
- void transform_weights(const TIn* const kernel, void *transform_working_space);
-
- /* Initializes matrices pointers, to be called once before execute()
- */
- template <typename InputTransform=Winograd2x2_3x3GemmInputChannelwise<TIn>>
- void reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const TIn* const input, void* working_space);
-
- /* Apply the Winograd operator to some input.
- */
- template <typename OutputTransform=Winograd2x2_3x3GemmOutput<TOut>>
- void reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output);
-
-
- /* Apply the Winograd operator to some input.
- */
- void execute(size_t first, size_t last);
-
- /* Get the memory required to transform the kernel.
- */
- static inline size_t get_kernel_transform_working_size(const KernelShape &shape);
-
- /* Get the output shape of a convolution.
- */
- static Tensor4DShape get_output_shape(const Tensor4DShape &input_shape, const KernelShape &k_shape,
- const PaddingType padding_type);
-
- /* Get the memory required to instantiate a new Winograd operator.
- */
- static size_t get_kernel_storage_size(const KernelShape &shape);
-
- /* Get the memory required to apply a Winograd operator to some input.
- */
- static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape,
- const PaddingType padding);
-
-
- Winograd2x2_3x3GEMM(const Winograd2x2_3x3GEMM &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- Winograd2x2_3x3GEMM &operator=(const Winograd2x2_3x3GEMM &) = delete;
- /** Allow instances of this class to be moved */
- Winograd2x2_3x3GEMM(Winograd2x2_3x3GEMM &&) = default;
- /** Allow instances of this class to be moved */
- Winograd2x2_3x3GEMM &operator=(Winograd2x2_3x3GEMM &&) = default;
-
- protected:
- /* Get the memory required by a single "input" matrix.
- */
- static size_t get_input_matrix_size(const Tensor4DShape &input_shape,const KernelShape &k_shape,
- const PaddingType padding);
-
- /* Get the memory required by a single "output" matrix.
- */
- static size_t get_output_matrix_size(const Tensor4DShape &input_shape, const KernelShape &k_shape,
- const PaddingType padding);
-
- /* Get the memory required by a single "kernel" matrix.
- */
- static size_t get_kernel_matrix_size(const KernelShape &shape);
-
- const KernelShape kernel_shape; // Shape of applied kernel
- const Tensor4DShape in_shape;
- const PaddingType padding;
-
- const int kernel_matrix_row_stride; // Stride within kernel matrix
-
- const bool manage_kernel_storage; // Free kernel storage when done
- void* const _kernel_storage; // Base pointer for kernel matrices
-
- profiler prof; // Profiler
-
- TIn *kernel_matrices[16]; // Prepared form of kernel
- TIn *input_matrices[16];
- TOut *output_matrices[16];
-
-
- static const int M_BLOCK = 4;
- static const int N_BLOCK = 16;
- };
-} // namespace winograd
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_transform_working_size(
- const KernelShape &shape
-)
-{
- // Need to re-order the kernel into HWIO form, require enough space to
- // represent the tensor.
- return sizeof(TIn) * shape.size();
-}
-
-
-template <typename TOut, typename TIn>
-template <typename KernelTransform>
-void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::transform_weights(
- const TIn* const kernel,
- void *transform_working_space
-)
-{
- const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
- int8_t* const ks_bytes = reinterpret_cast<int8_t *>(_kernel_storage);
- for (int i = 0; i < 16; i++) {
- kernel_matrices[i] = reinterpret_cast<TIn *>(
- ks_bytes + i*kernel_matrix_size_bytes);
- }
-
- const TIn *kernel_hwio = kernel;
- if( transform_working_space)
- {
- kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
- ofm_ifm_h_w_to_h_w_ifm_ofm(
- kernel, const_cast<TIn *>(kernel_hwio),
- kernel_shape.n_output_channels,
- kernel_shape.n_input_channels,
- kernel_shape.n_rows,
- kernel_shape.n_cols
- );
- }
- KernelTransform::execute(
- kernel_shape, kernel_hwio, kernel_matrices[0],
- kernel_matrix_size_bytes / sizeof(TIn),
- kernel_matrix_row_stride
- );
-}
-
-template <typename TOut, typename TIn>
-winograd::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM( const KernelShape &kernel_shape, const Tensor4DShape input_shape,
- const PaddingType padding_type, void *kernel_storage)
- : kernel_shape(kernel_shape), in_shape(input_shape), padding(padding_type),kernel_matrix_row_stride(roundup(kernel_shape.n_output_channels, N_BLOCK)), manage_kernel_storage(false),
- _kernel_storage(kernel_storage), prof() {
- memset(kernel_matrices, 0x00, sizeof(TIn)*16);
- memset(input_matrices, 0x00, sizeof(TIn)*16);
- memset(output_matrices, 0x00, sizeof(TOut)*16);
-}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-winograd::Winograd2x2_3x3GEMM<TOut, TIn>::~Winograd2x2_3x3GEMM() {}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-template <typename InputTransform>
-void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_input(
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- const TIn* const input,
- void *working_space
-) {
- assert(working_space);
- int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
- // Split the working space into that required for 16 input matrices and
- // output matrices.
- const int in_matrix_stride_bytes = get_input_matrix_size(input_shape, kernel_shape, padding_type);
- const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type);
-
- for (int i = 0; i < 16; i++) {
- input_matrices[i] = reinterpret_cast<TIn *>(
- ws_bytes + i*in_matrix_stride_bytes);
- output_matrices[i] = reinterpret_cast<TIn *>(
- ws_bytes + 16*in_matrix_stride_bytes + i*out_matrix_stride_bytes);
- }
-
- // Compute shape for the GEMM
- const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type);
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- const int K = kernel_shape.n_input_channels;
-
- const int in_matrix_row_stride = K;
- const int in_matrix_batch_stride = tile_rows*tile_cols*in_matrix_row_stride;
-
- // Transform the input tensor into an appropriate form
- auto input_prep = [&] () {
- InputTransform::execute(
- input, input_shape, padding_type, tile_rows, tile_cols,
- input_matrices[0], in_matrix_stride_bytes / sizeof(TIn),
- in_matrix_batch_stride, in_matrix_row_stride
- );
- };
- prof(
- "Input Prep", input_prep,
- InputTransform::bytes_read(input_shape, output_shape),
- InputTransform::flops_performed(input_shape, output_shape),
- InputTransform::bytes_written(input_shape, output_shape)
- );
-
-}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-template <typename OutputTransform>
-void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::reshape_output(const Tensor4DShape& input_shape, const PaddingType padding_type, TOut* const output) {
- assert(output_matrices[0]);
- const int out_matrix_stride_bytes = get_output_matrix_size(input_shape, kernel_shape, padding_type);
- const auto output_shape = get_output_shape(input_shape,kernel_shape, padding_type);
- const int out_matrix_row_stride = kernel_matrix_row_stride;
-
- // Transform the output tensor into an appropriate form
- OutputTransform::execute(
- output_shape,
- output_matrices[0],
- out_matrix_stride_bytes / sizeof(TOut),
- out_matrix_row_stride,
- output
- );
-}
-
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-void winograd::Winograd2x2_3x3GEMM<TOut, TIn>::execute( size_t first, size_t last ) {
- assert(input_matrices[0] && kernel_matrices[0] && output_matrices[0]);
- assert(first < 16 && last < 16 && first < last);
- // Compute shape for the GEMM
- const auto output_shape = get_output_shape(in_shape,kernel_shape, padding);
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- const int M = in_shape.n_batches * tile_rows * tile_cols;
- const int K = kernel_shape.n_input_channels;
- const int N = kernel_shape.n_output_channels;
-
- const int in_matrix_row_stride = K;
- const int out_matrix_row_stride = kernel_matrix_row_stride;
- // Perform the GEMMs
- for (size_t i = first; i <= last; i++) {
- BlockedGemm<M_BLOCK, N_BLOCK>(
- input_matrices[i], kernel_matrices[i], output_matrices[i], M, K, N,
- in_matrix_row_stride, kernel_matrix_row_stride, out_matrix_row_stride
- );
- }
-
-}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-Tensor4DShape winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(
- const Tensor4DShape &in_shape, const KernelShape &k_shape, const PaddingType padding) {
- return Tensor4DShape {
- in_shape.n_batches,
- (padding == PADDING_SAME) ? in_shape.n_rows : in_shape.n_rows - 2,
- (padding == PADDING_SAME) ? in_shape.n_cols : in_shape.n_cols - 2,
- k_shape.n_output_channels
- };
-}
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_storage_size(
- const KernelShape &shape) {
- return 16 * get_kernel_matrix_size(shape);
-}
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_kernel_matrix_size(
- const KernelShape &shape) {
- const int K = shape.n_input_channels;
- const int N = roundup(shape.n_output_channels, N_BLOCK);
- return sizeof(TIn) * K * N;
-}
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
- const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
-) {
- return 16 * get_input_matrix_size(input_shape, k_shape, padding_type) +
- 16 * get_output_matrix_size(input_shape, k_shape, padding_type);
-}
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(
- const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
-) {
- // Compute shape for the GEMM
- const auto output_shape = get_output_shape(input_shape, k_shape, padding_type);
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- const int M = roundup(tile_rows * tile_cols, M_BLOCK);
- const int K = k_shape.n_input_channels;
-
- return input_shape.n_batches * M * K * sizeof(TIn);
-}
-
-template <typename TOut, typename TIn>
-size_t winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(
- const Tensor4DShape& input_shape, const KernelShape &k_shape,const PaddingType padding_type
-) {
- // Compute shape for the GEMM
- const auto output_shape = get_output_shape(input_shape, k_shape, padding_type);
- const int tile_rows = iceildiv(output_shape.n_rows, 2);
- const int tile_cols = iceildiv(output_shape.n_cols, 2);
- const int M = roundup(tile_rows * tile_cols, M_BLOCK);
- const int N = roundup(k_shape.n_output_channels, N_BLOCK);
-
- return input_shape.n_batches * M * N * sizeof(TOut);
-}
diff --git a/src/core/NEON/kernels/winograd/winograd_layer.cpp b/src/core/NEON/kernels/winograd/winograd_layer.cpp
new file mode 100644
index 0000000..689ecba
--- /dev/null
+++ b/src/core/NEON/kernels/winograd/winograd_layer.cpp
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "convolution.hpp"
+#include "winograd_layer.hpp"
+#include "tensor.hpp"
+
+
+/** Determine how much memory (in units of TIn) to allocate for the transformed
+ * weights.
+ */
+template <
+ int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
+ typename TIn, typename TOut
+>
+unsigned int WinogradConvolutionLayer<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
+>::get_weight_storage_size(
+ const int n_output_channels, /** Number of output feature maps. */
+ const int n_input_channels /** Number of input feature maps. */
+)
+{
+ const KernelShape shape(
+ n_output_channels, KernelRows, KernelCols, n_input_channels
+ );
+ return static_cast<unsigned int>(
+    // WinogradConv returns the size in bytes; divide by `sizeof(TIn)` to
+    // express it in units of TIn.
+ WinogradConv::get_kernel_storage_size(shape) / sizeof(TIn)
+ );
+}
+
+
+/** Determine how much memory (in units of TIn) to allocate for the transformed
+ * input.
+ */
+template <
+ int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
+ typename TIn, typename TOut
+>
+unsigned int WinogradConvolutionLayer<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
+>::get_input_storage_size(
+ const int n_batches, /** Number of batches in the input tensor. */
+ const int n_channels, /** Number of feature maps in the input tensor. */
+ const int n_rows, /** Number of rows in each feature map. */
+ const int n_cols, /** Number of columns in each feature map. */
+ const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+)
+{
+ // Construct shapes for the input and kernel tensors.
+ const Tensor4DShape input_shape(n_batches, n_rows, n_cols, n_channels);
+ const KernelShape kern_shape(1, KernelRows, KernelCols, n_channels);
+ const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+ // Return the size, converted into units of TIn
+ return static_cast<unsigned int>(
+ WinogradConv::get_input_storage_size(kern_shape, input_shape, padding) /
+ sizeof(TIn)
+ );
+}
+
+
+/** Determine how much memory (in units of TOut) to allocate for the (Winograd
+ * domain) output.
+ */
+template <
+ int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
+ typename TIn, typename TOut
+>
+unsigned int WinogradConvolutionLayer<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
+>::get_output_storage_size(
+ const int n_batches, /** Number of batches in the output tensor. */
+ const int n_rows, /** Number of rows in each feature map of the input tensor. */
+ const int n_cols, /** Number of columns in each feature map of the input tensor. */
+ const int n_output_channels, /** Number of feature maps in the output tensor. */
+ const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+)
+{
+ // Construct shapes for the input and kernel tensors.
+ const Tensor4DShape input_shape(n_batches, n_rows, n_cols, 1);
+ const KernelShape kern_shape(n_output_channels, KernelRows, KernelCols, 1);
+ const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+ // Return the size, converted into units of TOut
+ return static_cast<unsigned int>(
+ WinogradConv::get_output_storage_size(kern_shape, input_shape, padding) /
+ sizeof(TOut)
+ );
+}
+
+
+/** Get the shape (rows, cols) of a feature map of the output tensor. */
+template <
+ int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
+ typename TIn, typename TOut
+>
+std::pair<int, int> WinogradConvolutionLayer<
+ OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut
+>::get_output_feature_map_shape(
+ const int n_input_rows, /** Number of rows in the input feature map. */
+ const int n_input_cols, /** Number of columns in the input feature map. */
+ const bool same_padding /** Use "SAME" padding, otherwise use "VALID". */
+)
+{
+ // Construct shapes for the input and kernel tensors.
+ const Tensor4DShape input_shape(1, n_input_rows, n_input_cols, 1);
+ const KernelShape kern_shape(1, KernelRows, KernelCols, 1);
+ const PaddingType padding = (same_padding) ? PADDING_SAME : PADDING_VALID;
+
+ // Compute the new shape
+ const auto output_shape = WinogradConv::get_output_shape(
+ kern_shape, input_shape, padding
+ );
+
+ return std::make_pair(output_shape.n_rows, output_shape.n_cols);
+}
+
+
+/** Create a new Winograd convolution layer.
+ */
+template <
+ int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
+ typename TIn, typename TOut
+>
+WinogradConvolutionLayer<OutputTileRows, OutputTileCols, KernelRows, KernelCols, TIn, TOut>::
+WinogradConvolutionLayer(
+ const int n_batches, /** Number of batches in the input and output tensors. */
+ const int n_input_channels, /** Number of feature maps in a batch of the input tensor. */
+ const int n_input_rows, /** Number of rows in a feature map of the input tensor. */
+ const int n_input_cols, /** Number of columns in a feature map of the input tensor. */
+ const int n_output_channels, /** Number of feature maps in the output tensor. */
+ const bool same_padding, /** Use "SAME" padding, otherwise use "VALID". */
+  const TIn* const weights,         /** Pointer to weight tensor in spatial domain. Must be ordered as "Height x Width x Input Feature Maps x Output Feature Maps". */
+ TIn* const winograd_weights, /** Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
+ const TIn* const input, /** Pointer to NHWC ordered input tensor, in the spatial domain. */
+ TIn* const winograd_input, /** Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
+ TOut* const output, /** Pointer to NHWC ordered output tensor, in the spatial domain. */
+ TOut* const winograd_output /** Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
+) : _kernel_shape(n_output_channels, KernelRows, KernelCols, n_input_channels),
+ _input_shape(n_batches, n_input_rows, n_input_cols, n_input_channels),
+ _padding(same_padding ? PADDING_SAME : PADDING_VALID),
+ _output_shape(WinogradConv::get_output_shape(_kernel_shape, _input_shape, _padding)),
+ _n_output_rows(_output_shape.n_rows),
+ _n_output_cols(_output_shape.n_cols),
+ _kernel_matrix_stride(WinogradConv::get_kernel_matrix_stride(_kernel_shape)),
+ _kernel_matrix_row_stride(roundup(n_output_channels, WinogradConv::N_BLOCK)),
+ _input_matrix_stride(WinogradConv::get_input_matrix_stride(_kernel_shape, _input_shape, _padding)),
+ _input_matrix_row_stride(n_input_channels),
+ _output_matrix_stride(WinogradConv::get_output_matrix_stride(_kernel_shape, _input_shape, _padding)),
+ _output_matrix_row_stride(_kernel_matrix_row_stride),
+ _tile_rows(iceildiv(_n_output_rows, OutputTileRows)),
+ _tile_cols(iceildiv(_n_output_cols, OutputTileCols)),
+ _m(n_batches * _tile_rows * _tile_cols),
+ _k(n_input_channels),
+ _n(n_output_channels),
+ weights_transform(
+ weights, winograd_weights,
+ _kernel_matrix_stride, _kernel_matrix_row_stride,
+ n_output_channels, n_input_channels
+ ),
+ input_transform(
+ input, n_batches, n_input_rows, n_input_cols, n_input_channels, _padding,
+ winograd_input, _input_matrix_stride, _input_matrix_row_stride
+ ),
+ gemms(
+ WinogradBase::N_GEMMS, _m, _k, _n,
+ _input_matrix_stride, _input_matrix_row_stride,
+ _kernel_matrix_stride, _kernel_matrix_row_stride,
+ _output_matrix_stride, _output_matrix_row_stride,
+ winograd_input, winograd_weights, winograd_output
+ ),
+ output_transform(
+ winograd_output, _output_matrix_stride, _output_matrix_row_stride,
+ output, n_batches, _n_output_rows, _n_output_cols, n_output_channels
+ )
+{
+}
+
+// Instantiate valid implementations.
+template class WinogradConvolutionLayer<2, 2, 3, 3, float, float>;
+template class WinogradConvolutionLayer<4, 4, 3, 3, float, float>;
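A hypothetical sizing-and-construction sketch for the layer above. The std::vector buffers, the assumption that the storage-size helpers are static, and the tensor shapes are all illustrative; note that the helpers return element counts, not bytes.

#include <vector>

#include "winograd_layer.hpp"

void build_f2x2_3x3_layer(const float *weights_hwio, const float *input_nhwc,
                          float *output_nhwc)
{
  using Layer = WinogradConvolutionLayer<2, 2, 3, 3, float, float>;

  const int  batches = 1, ifms = 32, rows = 28, cols = 28, ofms = 24;
  const bool same_padding = true;

  // Winograd-domain buffers, sized in elements by the helpers defined above.
  std::vector<float> wino_weights(Layer::get_weight_storage_size(ofms, ifms));
  std::vector<float> wino_input(Layer::get_input_storage_size(batches, ifms, rows, cols, same_padding));
  std::vector<float> wino_output(Layer::get_output_storage_size(batches, rows, cols, ofms, same_padding));

  Layer layer(batches, ifms, rows, cols, ofms, same_padding,
              weights_hwio, wino_weights.data(),
              input_nhwc, wino_input.data(),
              output_nhwc, wino_output.data());
  // The member stages (weights_transform, input_transform, gemms,
  // output_transform) are then run in that order, following the
  // run(0, get_window()) pattern used in winograd_gemm.cpp.
}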
diff --git a/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp b/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp
deleted file mode 100644
index de201fe..0000000
--- a/src/core/NEON/kernels/winograd/winograd_shim_nchw.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-#include <cstdint>
-#include <cstdlib>
-
-#include "gemm.hpp"
-#include "profiler.hpp"
-#include "utils.hpp"
-#include "shims.hpp"
-#include "winograd_gemm.hpp"
-
-#include "transforms.hpp"
-
-#ifndef ALLOC_ALIGN
-#define ALLOC_ALIGN 64
-#endif // ALLOC_ALIGN
-
-
-namespace winograd_shim_nchw {
- /***************************************************************************/
- /* Implementation of the Winograd F(2x2, 3x3, 4x4) algorithm using GEMM
- * internally.
- */
- template <typename TOut, typename TIn>
- class Winograd2x2_3x3GEMM : public winograd::Winograd2x2_3x3GEMM<TOut, TIn> {
- public:
- /* Instantiate a new Winograd operator.
- */
- Winograd2x2_3x3GEMM(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage);
-
- void nchw2nhwc( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input);
- void nhwc2nchw( const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, TOut* const output);
-
-
- std::pair<TOut*,TIn*> get_nhwc_ptrs(const Tensor4DShape& input_shape,const PaddingType padding_type,void *working_space);
-
- static size_t get_working_space_size(const Tensor4DShape &input_shape,const KernelShape &k_shape, const PaddingType padding);
- protected:
- /* Get the memory required to store an NHWC copy of the input tensor. */
- static size_t get_working_nhwc_input_size(const Tensor4DShape &input_shape);
-
- /* Get the memory required to store an NHWC copy of the input tensor. */
- static size_t get_working_nhwc_output_size(const Tensor4DShape &output_shape, const KernelShape &k_shape, const PaddingType padding) ;
- };
-} // namespace winograd
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::Winograd2x2_3x3GEMM(
- const KernelShape &kernel_shape, const Tensor4DShape input_shape,
- const PaddingType padding_type, void *kernel_storage
-) : winograd::Winograd2x2_3x3GEMM<TOut, TIn>(kernel_shape,input_shape,padding_type,kernel_storage) {
-}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nchw2nhwc(const Tensor4DShape& input_shape, const PaddingType padding_type, void *working_space, const TIn* const input) {
- assert(working_space);
- int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
-
- // Extract the top chunk of the working space to store the input and output
- // tensors in NHWC format.
- const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
- const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
-
- // Allocate working space for the input and output in NHWC format
- TIn* const input_nhwc = reinterpret_cast<TIn *>(
- ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes)
- );
-
- // Re-order the input tensor
- this->prof(
- "NCHW -> NHWC",
- [input, input_shape, input_nhwc] () {
- nchw_to_nhwc(
- input, input_nhwc,
- input_shape.n_batches,
- input_shape.n_channels,
- input_shape.n_rows,
- input_shape.n_cols
- );
- },
- input_shape.size(), 0, input_shape.size()
- );
-}
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-void winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::nhwc2nchw(const Tensor4DShape& input_shape, const PaddingType padding_type,
- void *working_space, TOut* const output) {
-
- assert(working_space);
- int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
-
- // Extract the top chunk of the working space to store the input and output
- // tensors in NHWC format.
- const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
- const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
-
- TOut* const output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));
-
- // Re-order the output tensor into NCHW
- const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape, this->kernel_shape, padding_type);
- this->prof(
- "NHWC -> NCHW",
- [output_nhwc, output_shape, output] () {
- nhwc_to_nchw(
- output_nhwc, output,
- output_shape.n_batches,
- output_shape.n_rows,
- output_shape.n_cols,
- output_shape.n_channels
- );
- },
- output_shape.size(), 0, output_shape.size()
- );
-}
-
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-std::pair<TOut*,TIn*> winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_nhwc_ptrs(
- const Tensor4DShape& input_shape,
- const PaddingType padding_type,
- void *working_space
-) {
- assert(working_space);
- int8_t* const ws_bytes = reinterpret_cast<int8_t *>(working_space);
-
- // Extract the top chunk of the working space to store the input and output
- // tensors in NHWC format.
- const int in_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_input_matrix_size(input_shape, this->kernel_shape, padding_type);
- const int out_matrix_stride_bytes = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_matrix_size(input_shape, this->kernel_shape, padding_type);
-
- // Allocate working space for the input and output in NHWC format
- TIn* input_nhwc = reinterpret_cast<TIn *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes));
- TOut* output_nhwc = reinterpret_cast<TOut *>(ws_bytes + 16*(in_matrix_stride_bytes + out_matrix_stride_bytes) + get_working_nhwc_input_size(input_shape));
- return std::make_pair(output_nhwc,input_nhwc);
-}
-
-
-
-
-/*****************************************************************************/
-template <typename TOut, typename TIn>
-size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
- const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
-) {
- return winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_space_size(
- input_shape, k_shape, padding_type)
- + get_working_nhwc_input_size(input_shape)
- + get_working_nhwc_output_size(input_shape, k_shape, padding_type);
-}
-
-template <typename TOut, typename TIn>
-size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_input_size(
- const Tensor4DShape& input_shape
-) {
- return roundup(input_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN));
-}
-
-template <typename TOut, typename TIn>
-size_t winograd_shim_nchw::Winograd2x2_3x3GEMM<TOut, TIn>::get_working_nhwc_output_size(
- const Tensor4DShape& input_shape, const KernelShape &k_shape, const PaddingType padding_type
-) {
- const auto output_shape = winograd::Winograd2x2_3x3GEMM<TOut, TIn>::get_output_shape(input_shape,k_shape, padding_type);
- return roundup(output_shape.size() * sizeof(TIn), static_cast<size_t>(ALLOC_ALIGN));
-}
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 8acd71c..0150a95 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,25 +30,55 @@
using namespace arm_compute;
+namespace
+{
+/** Extends parent shape depending on subtensor's coordinates and shape
+ *
+ * @param parent_shape Parent shape
+ * @param shape Subtensor shape
+ * @param coords Subtensor coordinates inside parent tensor
+ *
+ * @return Extended parent shape
+ */
+TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
+{
+ // Subtensor should not index in x, y dimensions.
+ ARM_COMPUTE_ERROR_ON((coords.x() != 0) || (coords.y() != 0));
+
+    // The sub-tensor cannot extend the parent in the x and y dimensions.
+ ARM_COMPUTE_ERROR_ON((parent_shape.total_size() != 0) && (parent_shape.x() != shape.x()) && (parent_shape.y() != shape.y()));
+
+ // Extend shape
+ for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+ {
+ int dimension_extend = coords[i] + static_cast<int>(shape[i]);
+ if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
+ {
+ parent_shape.set(i, static_cast<size_t>(dimension_extend));
+ }
+ }
+
+ return parent_shape;
+}
+} // namespace
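A worked example of the helper above, for the typical depth-concatenation case where a sub-tensor is appended along dimension 2 of its parent (values are illustrative):

// parent_shape = (32, 32, 16), sub-tensor shape = (32, 32, 8), coords = (0, 0, 16)
//   dim 0: 0 + 32 = 32, not greater than 32  -> unchanged
//   dim 1: 0 + 32 = 32, not greater than 32  -> unchanged
//   dim 2: 16 + 8 = 24, greater than 16      -> parent extended to (32, 32, 24)
// The x and y extents stay fixed, as the assertions at the top of the helper require.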
+
SubTensorInfo::SubTensorInfo()
- : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }
+ : _parent(nullptr), _tensor_shape(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false)
{
}
-SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords)
- : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }
+SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
+ : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent)
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(parent->tensor_shape().total_size() != 0)
+ if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
}
// Initialize valid region
- Coordinates coordinates;
- coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
- _valid_region = ValidRegion{ coordinates, _tensor_shape };
+ _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
}
std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
@@ -65,11 +95,19 @@
ITensorInfo &SubTensorInfo::set_tensor_shape(TensorShape shape)
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0)
+ if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
}
+ else if(_extend_parent) // Extend parent shape, configure if specified
+ {
+ ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN));
+ TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords);
+ _parent->set_tensor_shape(parent_extended_shape);
+ _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape });
+ }
_tensor_shape = shape;
return *this;
}
@@ -78,6 +116,7 @@
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
+ ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
// Extend parent padding if required
return _parent->extend_padding(padding);
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 60e76bf..24988e2 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -159,9 +159,7 @@
_strides_in_bytes = strides_in_bytes;
_total_size = total_size_in_bytes;
- Coordinates coordinates;
- coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
- _valid_region = ValidRegion{ coordinates, _tensor_shape };
+ _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
}
void TensorInfo::init(const HOGInfo &hog_info, unsigned int width, unsigned int height)
@@ -201,9 +199,7 @@
_format = Format::UNKNOWN;
_tensor_shape = tensor_shape;
- Coordinates coordinates;
- coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
- _valid_region = ValidRegion{ coordinates, _tensor_shape };
+ _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
auto_padding();
@@ -368,9 +364,9 @@
_total_size = _tensor_shape[idx_last_dimension] * _strides_in_bytes[idx_last_dimension];
}
- Coordinates coordinates;
- coordinates.set_num_dimensions(_tensor_shape.num_dimensions());
- _valid_region = ValidRegion{ coordinates, _tensor_shape };
+ std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+
+ _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
return *this;
}
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 76d0b0f..83a843d 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -261,29 +261,17 @@
const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_dimensions(
unsigned int in_width, unsigned int in_height, unsigned int kernel_width, unsigned int kernel_height, unsigned int padx, unsigned int pady,
- unsigned int ax, unsigned int ay, float upscalex, float upscaley, DimensionRoundingType round)
+ unsigned int inner_border_right, unsigned int inner_border_top, unsigned int stride_x, unsigned int stride_y)
{
ARM_COMPUTE_ERROR_ON(in_width < 1 || in_height < 1);
- ARM_COMPUTE_ERROR_ON(((in_width - 1) * upscalex + kernel_width + ax) < 2.f * padx);
- ARM_COMPUTE_ERROR_ON(((in_height - 1) * upscaley + kernel_height + ay) < 2.f * pady);
- const float fw = (in_width - 1) * upscalex - 2.f * padx + kernel_width + ax;
- const float fh = (in_height - 1) * upscaley - 2.f * pady + kernel_height + ay;
- int w = 0;
- int h = 0;
- switch(round)
- {
- case DimensionRoundingType::FLOOR:
- w = std::floor(fw);
- h = std::floor(fh);
- break;
- case DimensionRoundingType::CEIL:
- w = std::ceil(fw);
- h = std::ceil(fh);
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
+ ARM_COMPUTE_ERROR_ON(((in_width - 1) * stride_x + kernel_width + inner_border_right) < 2 * padx);
+ ARM_COMPUTE_ERROR_ON(((in_height - 1) * stride_y + kernel_height + inner_border_top) < 2 * pady);
+ const int padx_deconv = (kernel_width - padx - 1);
+ const int pady_deconv = (kernel_height - pady - 1);
+ ARM_COMPUTE_ERROR_ON(padx_deconv < 0);
+ ARM_COMPUTE_ERROR_ON(pady_deconv < 0);
+ const int w = stride_x * (in_width - 1) + kernel_width + inner_border_right - 2 * padx_deconv;
+ const int h = stride_y * (in_height - 1) + kernel_height + inner_border_top - 2 * pady_deconv;
return std::make_pair<unsigned int, unsigned int>(w, h);
}
@@ -332,6 +320,7 @@
{
switch(dt)
{
+ case DataType::QASYMM8:
case DataType::U8:
print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
break;
@@ -367,6 +356,7 @@
{
switch(dt)
{
+ case DataType::QASYMM8:
case DataType::U8:
return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
case DataType::QS8:
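As a worked example of the revised deconvolution_output_dimensions() formula above (illustrative values only): with a 4x4 input, a 3x3 kernel, stride 2, padding 1 and no inner border, padx_deconv = 3 - 1 - 1 = 1 and w = 2 * (4 - 1) + 3 + 0 - 2 * 1 = 7, so the output is 7x7. A minimal sketch:

#include "arm_compute/core/Utils.h"

#include <iostream>

int main()
{
    // 4x4 input, 3x3 kernel, pad 1, no inner border, stride 2 -> expected 7x7.
    const auto dims = arm_compute::deconvolution_output_dimensions(4, 4, /* in_width, in_height */
                                                                   3, 3, /* kernel_width, kernel_height */
                                                                   1, 1, /* padx, pady */
                                                                   0, 0, /* inner_border_right, inner_border_top */
                                                                   2, 2); /* stride_x, stride_y */
    std::cout << dims.first << "x" << dims.second << std::endl; // prints 7x7
    return 0;
}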
diff --git a/src/graph/SubGraph.cpp b/src/graph/SubGraph.cpp
index e975421..8ba2af6 100644
--- a/src/graph/SubGraph.cpp
+++ b/src/graph/SubGraph.cpp
@@ -52,12 +52,12 @@
}
}
-std::unique_ptr<Graph> SubGraph::construct(TargetHint hint, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output)
+std::unique_ptr<Graph> SubGraph::construct(const GraphContext &ctx, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output)
{
auto graph = arm_compute::support::cpp14::make_unique<Graph>();
// Set hint
- graph->hints().set_target_hint(hint);
+ graph->hints() = ctx.hints();
// Configure input
if(_input == nullptr)
diff --git a/src/graph/SubTensor.cpp b/src/graph/SubTensor.cpp
index 2edeb3b..2e640dd 100644
--- a/src/graph/SubTensor.cpp
+++ b/src/graph/SubTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,21 +37,21 @@
namespace
{
template <typename SubTensorType, typename ParentTensorType>
-std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords)
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, TensorShape shape, Coordinates coords, bool extend_parent)
{
auto ptensor = dynamic_cast<ParentTensorType *>(parent);
- auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+ auto subtensor = arm_compute::support::cpp14::make_unique<SubTensorType>(ptensor, shape, coords, extend_parent);
return std::move(subtensor);
}
} // namespace
SubTensor::SubTensor()
- : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr)
+ : _target(TargetHint::DONT_CARE), _tensor_shape(), _coords(), _parent(nullptr), _subtensor(nullptr), _extend_parent(false)
{
}
-SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords)
- : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr)
+SubTensor::SubTensor(Tensor &parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
+ : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr), _extend_parent(extend_parent)
{
ARM_COMPUTE_ERROR_ON(parent.tensor() == nullptr);
_parent = parent.tensor();
@@ -60,8 +60,8 @@
instantiate_subtensor();
}
-SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target)
- : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr)
+SubTensor::SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target, bool extend_parent)
+ : _target(target), _tensor_shape(tensor_shape), _coords(coords), _parent(parent), _subtensor(nullptr), _extend_parent(extend_parent)
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
instantiate_subtensor();
@@ -108,10 +108,10 @@
switch(_target)
{
case TargetHint::OPENCL:
- _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords);
+ _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords, _extend_parent);
break;
case TargetHint::NEON:
- _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords);
+ _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords, _extend_parent);
break;
default:
ARM_COMPUTE_ERROR("Invalid TargetHint");
diff --git a/src/graph/nodes/BranchLayer.cpp b/src/graph/nodes/BranchLayer.cpp
index d062e4b..7a20a56 100644
--- a/src/graph/nodes/BranchLayer.cpp
+++ b/src/graph/nodes/BranchLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,46 +37,6 @@
using namespace arm_compute::graph;
-namespace
-{
-void depth_concatenate_output_info(ITensorInfo *info, ITensorInfo *sub_tensor_info)
-{
- ARM_COMPUTE_ERROR_ON(info == nullptr);
- ARM_COMPUTE_ERROR_ON(sub_tensor_info == nullptr);
-
- TensorShape info_shape = info->tensor_shape();
- const TensorShape &sub_tensor_info_shape = sub_tensor_info->tensor_shape();
-
- // Update parent info and valid region
- if(info_shape.total_size() == 0)
- {
- arm_compute::auto_init_if_empty(*info,
- sub_tensor_info->tensor_shape(),
- sub_tensor_info->num_channels(),
- sub_tensor_info->data_type(), sub_tensor_info->fixed_point_position());
- info->set_valid_region(sub_tensor_info->valid_region());
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(info->num_channels() != sub_tensor_info->num_channels());
- ARM_COMPUTE_ERROR_ON(info->data_type() != sub_tensor_info->data_type());
- ARM_COMPUTE_ERROR_ON(info->fixed_point_position() != sub_tensor_info->fixed_point_position());
-
- // Concatenate depth
- ARM_COMPUTE_ERROR_ON(info_shape.x() != sub_tensor_info_shape.x());
- ARM_COMPUTE_ERROR_ON(info_shape.y() != sub_tensor_info_shape.y());
- info_shape.set(2, info_shape.z() + sub_tensor_info_shape.z());
- info->set_tensor_shape(info_shape);
-
- // Update valid region
- arm_compute::ValidRegion info_valid_region = info->valid_region();
- info_valid_region.shape.set(2, info_shape.z());
- arm_compute::ValidRegion updated_region = arm_compute::intersect_valid_regions(info_valid_region, sub_tensor_info->valid_region());
- info->set_valid_region(updated_region);
- }
-}
-} // namespace
-
/** Branch function */
class BranchFunction final : public arm_compute::IFunction
{
@@ -117,9 +77,8 @@
// Create branch function
auto func = arm_compute::support::cpp14::make_unique<BranchFunction>();
- // Track output SubTensorInfo and depth
- TensorInfo out_info;
- int depth = 0;
+ // Track output depth
+ int depth = 0;
// Construct all sub-graphs given the input/output
for(auto &sg : _sub_graphs)
@@ -143,15 +102,18 @@
// Create output sub-tensor
if(!sg->has_output())
{
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
- out = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(output),
- output->tensor()->info()->tensor_shape(),
- Coordinates(0, 0, depth));
+ ARM_COMPUTE_ERROR_ON((dynamic_cast<Tensor *>(output) == nullptr) && (dynamic_cast<SubTensor *>(output) == nullptr));
+
+ out = arm_compute::support::cpp14::make_unique<SubTensor>(output->tensor(),
+ TensorShape(),
+ Coordinates(0, 0, depth),
+ output->target(),
+ true);
out_sub_tensor = dynamic_cast<SubTensor *>(out.get());
}
// Construct sub_graph
- auto g = sg->construct(ctx.hints().target_hint(), std::move(in), std::move(out));
+ auto g = sg->construct(ctx, std::move(in), std::move(out));
// Register graph to function
func->register_graph(std::move(g));
@@ -161,16 +123,8 @@
{
ARM_COMPUTE_ERROR_ON(out_sub_tensor->tensor() == nullptr);
depth += out_sub_tensor->tensor()->info()->tensor_shape()[2];
- depth_concatenate_output_info(&out_info, out_sub_tensor->tensor()->info());
}
}
- // Auto-init output
- arm_compute::auto_init_if_empty(*output->tensor()->info(),
- out_info.tensor_shape(),
- out_info.num_channels(),
- out_info.data_type(),
- out_info.fixed_point_position());
-
return std::move(func);
}
\ No newline at end of file
diff --git a/src/graph/nodes/ConvolutionLayer.cpp b/src/graph/nodes/ConvolutionLayer.cpp
index ae4a8d7..f292b89 100644
--- a/src/graph/nodes/ConvolutionLayer.cpp
+++ b/src/graph/nodes/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -106,13 +106,16 @@
const WeightsInfo &weights_info,
ConvolutionMethodHint conv_method)
{
- if(conv_method == ConvolutionMethodHint::GEMM)
+ if((conv_method == ConvolutionMethodHint::DIRECT)
+ && arm_compute::CLDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
{
- return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLDirectConvolutionLayer");
+ return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
}
else
{
- return instantiate_direct_function<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
+ return instantiate_function<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, TargetHint::OPENCL>(input, weights, biases, output, conv_info, weights_info);
}
}
@@ -122,13 +125,16 @@
const WeightsInfo &weights_info,
ConvolutionMethodHint conv_method)
{
- if(conv_method == ConvolutionMethodHint::GEMM)
+ if((conv_method == ConvolutionMethodHint::DIRECT)
+ && arm_compute::NEDirectConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info)) // NOLINT
{
- return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDirectConvolutionLayer");
+ return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
}
else
{
- return instantiate_direct_function<arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
+ return instantiate_function<arm_compute::NEConvolutionLayer, arm_compute::ITensor, TargetHint::NEON>(input, weights, biases, output, conv_info, weights_info);
}
}
} // namespace
@@ -184,14 +190,17 @@
// Set weights and biases info
if(_weights.tensor() == nullptr)
{
- _weights.set_info(TensorInfo(TensorShape(_conv_width, _conv_height, in->info()->dimension(2) / _num_groups, _ofm),
+ TensorInfo info = TensorInfo(TensorShape(_conv_width, _conv_height, in->info()->dimension(2) / _num_groups, _ofm),
in->info()->num_channels(),
in->info()->data_type(),
- in->info()->fixed_point_position()));
+ in->info()->fixed_point_position());
+ info.set_quantization_info(_weights_quant_info);
+ _weights.set_info(std::move(info));
}
if(_biases.has_accessor() && _biases.tensor() == nullptr)
{
- _biases.set_info(TensorInfo(TensorShape(_ofm), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+ DataType dt = in->info()->data_type();
+ _biases.set_info(TensorInfo(TensorShape(_ofm), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
}
std::unique_ptr<arm_compute::IFunction> func;
@@ -213,7 +222,8 @@
TensorShape output_shape = calculate_convolution_layer_output_shape(in->info()->tensor_shape(), _weights.info().tensor_shape(), _conv_info);
// Output auto inizialitation if not yet initialized
- arm_compute::auto_init_if_empty(*out->info(), output_shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
+ arm_compute::auto_init_if_empty(*out->info(), output_shape, 1, in->info()->data_type(), in->info()->fixed_point_position(),
+ (_out_quant_info.empty()) ? in->info()->quantization_info() : _out_quant_info);
// Create appropriate convolution function
if(_num_groups == 1)
@@ -254,12 +264,10 @@
std::unique_ptr<arm_compute::IFunction> func;
if(_target_hint == TargetHint::OPENCL)
{
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
func = instantiate<TargetHint::OPENCL>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
}
else
{
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
func = instantiate<TargetHint::NEON>(input, _weights.tensor(), _biases.tensor(), output, _conv_info, _weights_info, conv_method_hint);
}
return func;
@@ -321,12 +329,10 @@
// Instantiate convolution function
if(_target_hint == TargetHint::OPENCL)
{
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLConvolutionLayer");
func = instantiate<TargetHint::OPENCL>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
}
else
{
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEConvolutionLayer");
func = instantiate<TargetHint::NEON>(_is[i].tensor(), _ws[i].tensor(), _bs[i].tensor(), _os[i].tensor(), _conv_info, _weights_info, conv_method_hint);
}
diff --git a/src/graph/nodes/DepthwiseConvolutionLayer.cpp b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
index b459853..1209d03 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayer.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,11 +43,14 @@
TensorShape shape = in->info()->tensor_shape();
shape.set(Window::DimX, _conv_width);
shape.set(Window::DimY, _conv_height);
- _weights.set_info(TensorInfo(TensorShape(shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+ TensorInfo info = TensorInfo(TensorShape(shape), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position());
+ info.set_quantization_info(_quant_info);
+ _weights.set_info(std::move(info));
}
if(_biases.has_accessor() && _biases.tensor() == nullptr)
{
- _biases.set_info(TensorInfo(TensorShape(in->info()->dimension(2)), in->info()->num_channels(), in->info()->data_type(), in->info()->fixed_point_position()));
+ DataType dt = in->info()->data_type();
+ _biases.set_info(TensorInfo(TensorShape(in->info()->dimension(2)), in->info()->num_channels(), is_data_type_quantized_asymmetric(dt) ? DataType::S32 : dt, in->info()->fixed_point_position()));
}
bool weights_is_loaded = _weights.tensor() != nullptr;
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index bbe0739..b0c117e 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@
arm_compute::ITensor *out = output->tensor();
// Auto configure output
- arm_compute::auto_init_if_empty(*out->info(), _shape, 1, in->info()->data_type(), in->info()->fixed_point_position());
+ arm_compute::auto_init_if_empty(*out->info(), _shape, 1, in->info()->data_type(), in->info()->fixed_point_position(), in->info()->quantization_info());
// Create node context
NodeContext node_ctx(OperationType::ReshapeLayer);
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
index 8f2bf23..61315e7 100644
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ b/src/graph/operations/CLSimpleOperations.cpp
@@ -156,13 +156,13 @@
bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
if(run_3x3_opt)
{
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
+ auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
depwthwise_conv->configure(in, weights, biases, out, conv_info);
func = std::move(depwthwise_conv);
}
else
{
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>();
+ auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::CLDepthwiseConvolutionLayer>();
depwthwise_conv->configure(in, weights, biases, out, conv_info);
func = std::move(depwthwise_conv);
}
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
index bb99e8d..49adbe9 100644
--- a/src/graph/operations/NESimpleOperations.cpp
+++ b/src/graph/operations/NESimpleOperations.cpp
@@ -149,23 +149,12 @@
auto *biases = ctx.num_inputs() == 3 ? dynamic_cast<arm_compute::ITensor *>(ctx.input(2)) : nullptr;
auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
const auto conv_info = ctx.parameter<PadStrideInfo>("ConvolutionInfo");
- const auto opt3x3 = ctx.parameter<bool>("Optimized3x3");
// Create and configure function
std::unique_ptr<arm_compute::IFunction> func;
- bool run_3x3_opt = opt3x3 && weights->info()->dimension(0) == 3;
- if(run_3x3_opt)
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
- else
- {
- auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>();
- depwthwise_conv->configure(in, weights, biases, out, conv_info);
- func = std::move(depwthwise_conv);
- }
+ auto depwthwise_conv = arm_compute::support::cpp14::make_unique<arm_compute::NEDepthwiseConvolutionLayer>();
+ depwthwise_conv->configure(in, weights, biases, out, conv_info);
+ func = std::move(depwthwise_conv);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEDepthwiseConvolutionLayer"
@@ -460,4 +449,4 @@
<< std::endl);
return std::move(smx);
-}
\ No newline at end of file
+}
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index 7cd5518..865f389 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -61,6 +61,7 @@
const bool is_orb_scale = (SCALE_PYRAMID_ORB == _info.scale());
TensorShape tensor_shape = _info.tensor_shape();
+ // Note: Look-up table used by the OpenVX sample implementation
const std::array<float, 4> c_orbscale =
{
{
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 71a749f..65292fe 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,8 @@
using namespace arm_compute;
+std::once_flag CLScheduler::_initialize_symbols;
+
CLScheduler::CLScheduler()
: _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
{
@@ -35,6 +37,7 @@
CLScheduler &CLScheduler::get()
{
+ std::call_once(_initialize_symbols, opencl_is_available);
static CLScheduler scheduler;
return scheduler;
}
@@ -59,4 +62,4 @@
{
_queue.flush();
}
-}
\ No newline at end of file
+}
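A small sketch of what the std::call_once change above guards: CLScheduler::get() now loads the OpenCL symbols at most once before the scheduler singleton is constructed. Callers can still probe for a runtime explicitly via the existing opencl_is_available() helper (assumed to be the one declared in arm_compute/core/CL/OpenCL.h):

#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

void init_cl_if_present()
{
    if(!arm_compute::opencl_is_available())
    {
        std::cerr << "No OpenCL runtime found, skipping the CL path" << std::endl;
        return;
    }
    // Safe even on the first call: the symbols are loaded before the scheduler is built.
    arm_compute::CLScheduler::get().default_init();
}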
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index b228c0a..5f58024 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,11 +29,11 @@
using namespace arm_compute;
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
- _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords, extend_parent);
_parent = parent;
}
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index a9b0867..0131801 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,7 +53,8 @@
}
template <unsigned int matrix_size>
-void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+void CLConvolutionSquare<matrix_size>::configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+ uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(conv == nullptr);
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index 0ed3351..b3af11e 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,9 +43,6 @@
void CLConvolutionLayerReshapeWeights::configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
if(biases != nullptr)
@@ -82,13 +79,14 @@
{
_weights_reshape_kernel.configure(weights, biases_to_use, output);
}
+
+ output->info()->set_quantization_info(weights->info()->quantization_info());
}
void CLConvolutionLayerReshapeWeights::run()
{
_memory_group.acquire();
- cl::CommandQueue q = CLScheduler::get().queue();
CLScheduler::get().enqueue(_weights_reshape_kernel);
if(_transpose1xW)
{
@@ -99,33 +97,49 @@
}
CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _output_col2im_kernel(),
- _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false),
- _are_weights_reshaped(false), _is_quantized(false)
+ : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _interleave_kernel(), _mm_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
+ _col2im_kernel(), _im2col_output(), _interleave_output(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _are_weights_reshaped(false), _is_quantized(false),
+ _is_interleaved_transposed(false)
{
}
-void CLConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed)
+void CLConvolutionLayer::configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed, bool are_weights_reshaped)
{
if(_is_quantized)
{
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->info()->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
+ if(are_weights_reshaped)
+ {
+ ARM_COMPUTE_ERROR("Weights already reshaped are not supported with gemmlowp");
+ }
+ else
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info = input->info()->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
+ input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.scale, -input_quantization_info.offset));
+ weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.scale, -weights_quantization_info.offset));
- _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+ _mm_gemmlowp.configure(input, weights, output, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
- // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(input_quantization_info);
- weights->info()->set_quantization_info(weights_quantization_info);
+ // Revert back the QuantizationInfo as input and weights could be used in other convolution layers
+ input->info()->set_quantization_info(input_quantization_info);
+ weights->info()->set_quantization_info(weights_quantization_info);
+ }
}
else
{
- _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ if(are_weights_reshaped)
+ {
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+ }
+ else
+ {
+ // Configure matrix multiply function
+ _mm_gemm.configure(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/));
+ }
}
}
@@ -134,6 +148,7 @@
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
+ ARM_COMPUTE_ERROR_ON(weights_info.are_reshaped() && CLScheduler::get().target() == GPUTarget::BIFROST);
ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
ARM_COMPUTE_ERROR_ON(weights_info.are_reshaped() && is_data_type_quantized_asymmetric(input->info()->data_type()));
@@ -157,14 +172,16 @@
const DataType dt = input->info()->data_type();
- // Set the GPU target for matrix multiply
+ // Set the GPU target for matrix multiply and im2col and col2im
_mm_kernel.set_target(CLScheduler::get().target());
+ _im2col_kernel.set_target(CLScheduler::get().target());
+ _col2im_kernel.set_target(CLScheduler::get().target());
- _append_bias = (biases != nullptr) && (!_is_quantized);
- _are_weights_reshaped = weights_info.are_reshaped();
+ const bool append_bias = (biases != nullptr) && (!_is_quantized);
+ _are_weights_reshaped = weights_info.are_reshaped();
- const unsigned bias_element = (_append_bias) ? 1 : 0;
- const ICLTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+ const unsigned bias_element = (append_bias) ? 1 : 0;
+ const ICLTensor *biases_to_use = (append_bias) ? biases : nullptr;
// Get parameters from conv_info
unsigned int stride_x = 0;
@@ -181,8 +198,8 @@
conv_info);
// Check if its a "fully connected" convolution
- _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
- const bool run_interleaved = (!_is_fully_connected_convolution && !_is_quantized);
+ const bool is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+ _is_interleaved_transposed = (!is_fully_connected_convolution) && (!_is_quantized) && (_are_weights_reshaped);
unsigned int mat_weights_cols = weights->info()->dimension(3);
unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
@@ -190,7 +207,7 @@
// Reshape weights if needed
if(_are_weights_reshaped)
{
- if(_is_fully_connected_convolution || _is_quantized)
+ if(is_fully_connected_convolution || _is_quantized)
{
mat_weights_cols = weights->info()->dimension(0);
mat_weights_rows = weights->info()->dimension(1);
@@ -204,22 +221,10 @@
}
else
{
- if(_is_fully_connected_convolution || _is_quantized)
- {
- // Create tensor to store the reshaped weights
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
- }
- else
- {
- // Create tensor to store transposed weights
- const float transpose_width = 16.0f / input->info()->element_size();
- TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
- }
- _weights_reshaped.info()->set_quantization_info(weights->info()->quantization_info());
+ // _weights_reshaped will be auto configured in the kernel.
+ // Just append biases and do not transpose 1xW as it will be reshaped in CLGEMM
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false);
+
weights = &_weights_reshaped;
}
@@ -230,50 +235,43 @@
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
+ // input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
im2col_reshaped_info.set_quantization_info(input->info()->quantization_info());
- _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
- _memory_group.manage(&_input_im2col_reshaped);
-
- // Create tensor (interleave) to prepare input tensor for GEMM
- if(run_interleaved)
- {
- TensorShape shape_interleaved = shape_im2col;
- shape_interleaved.set(0, shape_interleaved.x() * 4);
- shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
- interleaved_info.set_quantization_info(input->info()->quantization_info());
- _input_interleaved_reshaped.allocator()->init(interleaved_info);
- _memory_group.manage(&_input_interleaved_reshaped);
- }
+ _im2col_output.allocator()->init(im2col_reshaped_info);
+ _memory_group.manage(&_im2col_output);
// Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ TensorShape shape_gemm = _im2col_output.info()->tensor_shape();
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
const DataType gemm_data_type = _is_quantized ? DataType::S32 : dt;
// GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
+ // input->clone() doesn't work with subtensors for grouped convolutions.
TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
info_gemm.set_quantization_info(output->info()->quantization_info());
_gemm_output.allocator()->init(info_gemm);
_memory_group.manage(&_gemm_output);
- // Configure kernels
- _input_im2col_kernel.set_target(CLScheduler::get().target());
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+ // Configure im2col
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, append_bias);
// Configure matrix multiply
- if(run_interleaved)
+ if(_is_interleaved_transposed)
{
- _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
- configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
- _input_interleaved_reshaped.allocator()->allocate();
+ // Configure GEMMInterleave4x4. _interleave_output will be auto configured in the kernel
+ _interleave_kernel.configure(&_im2col_output, &_interleave_output);
+ _memory_group.manage(&_interleave_output);
+
+ // Configure GEMM
+ configure_mm(&_interleave_output, weights, &_gemm_output, true, _are_weights_reshaped);
+ _interleave_output.allocator()->allocate();
}
else
{
- configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
+ configure_mm(&_im2col_output, weights, &_gemm_output, false, _are_weights_reshaped);
}
- _input_im2col_reshaped.allocator()->allocate();
+ _im2col_output.allocator()->allocate();
// Configure output stage for quantized case
if(_is_quantized)
@@ -286,8 +284,7 @@
}
// Configure Col2Im
- _output_col2im_kernel.set_target(CLScheduler::get().target());
- _output_col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _col2im_kernel.configure(_is_quantized ? &_tmp_output : &_gemm_output, output, std::make_pair(conv_w, conv_h));
if(_is_quantized)
{
_tmp_output.allocator()->allocate();
@@ -318,32 +315,39 @@
_memory_group.acquire();
// Run im2col
- CLScheduler::get().enqueue(_input_im2col_kernel);
+ CLScheduler::get().enqueue(_im2col_kernel);
- if(!_is_fully_connected_convolution && !_is_quantized)
+ // Note: _is_interleaved_transposed is true only if the weights passed to the function were already reshaped
+ // and the data type is not QASYMM8. If this flag is true, we need to run the
+ // GEMM kernel instead of the GEMM function
+ if(_is_interleaved_transposed)
{
- // Run interleave4x4
- CLScheduler::get().enqueue(_input_interleave_kernel);
- }
+ // Run interleave4x4 kernel
+ CLScheduler::get().enqueue(_interleave_kernel);
- // Runs matrix multiply on reshaped matrices
- if(_is_quantized)
- {
- _mm_gemmlowp.run();
+ // Run matrix multiply kernel
+ CLScheduler::get().enqueue(_mm_kernel);
}
else
{
- CLScheduler::get().enqueue(_mm_kernel);
- }
+ // Runs CLGEMM or CLGEMMLowpMatrixMultiplyCore functions
+ if(_is_quantized)
+ {
+ // Run gemmlowp
+ _mm_gemmlowp.run();
- // Run output stage for quantized case
- if(_is_quantized)
- {
- _gemmlowp_output_stage.run();
+ // Run output stage
+ _gemmlowp_output_stage.run();
+ }
+ else
+ {
+ // Run gemm
+ _mm_gemm.run();
+ }
}
// Reshape output matrix
- CLScheduler::get().enqueue(_output_col2im_kernel, false);
+ CLScheduler::get().enqueue(_col2im_kernel, false);
_memory_group.release();
}
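A minimal usage sketch of CLConvolutionLayer under the reworked path above, with illustrative shapes. With a default WeightsInfo() the weights are reported as not reshaped, so the layer reshapes them internally and routes the matrix multiply through CLGEMM, which in turn reshapes B only on the first run:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

void run_conv_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

    CLConvolutionLayer conv;
    // Default WeightsInfo(): weights are not pre-reshaped, so CLGEMM is used for the matrix multiply.
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 1, 1), WeightsInfo());

    for(auto *t : { &src, &weights, &bias, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src/weights/bias ...
    conv.run();
}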
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
new file mode 100644
index 0000000..1c55722
--- /dev/null
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2017, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _scaled_output()
+{
+}
+
+Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) < 1);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_right > stride_x - 1, "inner_border_right must be smaller than stride_x");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border_top > stride_y - 1, "inner_border_top must be smaller than stride_y");
+
+ auto out_dims = deconvolution_output_dimensions(input->dimension(0), input->dimension(1), weights->dimension(0), weights->dimension(1),
+ info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
+
+ const TensorShape output_shape = deconvolution_output_shape(out_dims, input->tensor_shape(), weights->tensor_shape());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
+
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_deconvolution_shape(*input, stride_x, stride_y, inner_border_right, inner_border_top,
+ info)));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, BorderSize(inner_border_right, inner_border_top), info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info));
+
+ return Status{};
+}
+
+void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ unsigned int inner_border_right, unsigned int inner_border_top)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ info.pad().first, info.pad().second, inner_border_top, inner_border_right, stride_x, stride_y);
+
+ const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
+
+ _memory_group.manage(&_scaled_output);
+
+ // Configure the scale function
+ // Init and allocate the intermediate tensor for the upscaled output: same shape as the input, except for the first two axes which hold the upscaled width and height
+ TensorShape scale_out_shape(input->info()->tensor_shape());
+ const unsigned int out_x = input->info()->dimension(0) + (input->info()->dimension(0) - 1) * (stride_x - 1) + inner_border_right + 2 * info.pad().first;
+ const unsigned int out_y = input->info()->dimension(1) + (input->info()->dimension(1) - 1) * (stride_y - 1) + inner_border_top + 2 * info.pad().second;
+ scale_out_shape.set(0, out_x);
+ scale_out_shape.set(1, out_y);
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ _scaled_output.allocator()->init(scale_out_info);
+
+ _scale_f.configure(input, &_scaled_output, BorderSize(inner_border_top, inner_border_right), info);
+
+ // Set up the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
+ _scaled_output.allocator()->allocate();
+}
+
+void CLDeconvolutionLayer::run()
+{
+ _memory_group.acquire();
+ _scale_f.run();
+ _conv_f.run();
+ _memory_group.release();
+}
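A minimal usage sketch for the new CLDeconvolutionLayer, following the configure()/run() interface added above; shapes and strides are illustrative, and the output is auto-initialised by configure() (7x7x16 for these values, matching the revised output formula):

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

using namespace arm_compute;

void run_deconv_sketch()
{
    CLScheduler::get().default_init();

    // 4x4x3 input, 3x3 kernels with 16 output feature maps, stride 2, pad 1, no inner border.
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    CLDeconvolutionLayer deconv;
    // dst is auto-initialised by configure() from the computed deconvolution output shape.
    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 1, 1), 0, 0);

    for(auto *t : { &src, &weights, &bias, &dst })
    {
        t->allocator()->allocate();
    }
    // ... fill src/weights/bias ...
    deconv.run();
}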
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
new file mode 100644
index 0000000..13a24f8
--- /dev/null
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017, 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
+ : _upsample(),
+ _output(nullptr)
+{
+}
+
+Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ return CLDeconvolutionLayerUpsampleKernel::validate(input, output, inner_border, info);
+}
+
+void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
+ const PadStrideInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+ _output = output;
+ _upsample.configure(input, _output, inner_border, info);
+}
+
+void CLDeconvolutionLayerUpsample::run()
+{
+ _output->map(CLScheduler::get().queue(), true);
+ memset(_output->buffer(), 0, _output->info()->total_size());
+ _output->unmap(CLScheduler::get().queue());
+
+ CLScheduler::get().enqueue(_upsample, false);
+}
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 7fd81cd..68c6576 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
@@ -32,6 +33,34 @@
#include <algorithm>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output, bool is_interleaved_transposed)
+{
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ if(is_data_type_quantized_asymmetric(input.data_type()))
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo input_quantization_info(input.quantization_info().scale, -input.quantization_info().offset);
+ const QuantizationInfo weights_quantization_info(weights.quantization_info().scale, -weights.quantization_info().offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input.clone()->set_quantization_info(input_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info),
+ &output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+ }
+
+ return Status{};
+}
+} // namespace
void CLFullyConnectedLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
@@ -40,6 +69,11 @@
_kernel = std::move(k);
}
+Status CLFullyConnectedLayerReshapeWeights::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+ return CLTransposeKernel::validate(input, output);
+}
+
CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _accumulate_biases_kernel(), _im2col_output(),
_gemmlowp_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false), _is_quantized(false)
@@ -80,8 +114,7 @@
// If the fully connected layer is called after a convolution layer, the input tensor must be linearized
// Initialize output tensor for im2col
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.collapse(3);
+ TensorShape shape_im2col = compute_im2col_shape(*input->info());
_im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
// Configure im2col kernel
@@ -105,9 +138,15 @@
void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights, bool are_weights_reshaped)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ transpose_weights,
+ are_weights_reshaped));
_are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
@@ -192,6 +231,86 @@
}
}
+Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights, bool are_weights_reshaped)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+
+ bool weights_reshaped = transpose_weights ? are_weights_reshaped : true;
+ bool is_fc_after_conv = true;
+ bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ const ITensorInfo &im2col_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(*input)));
+ const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &gemmlowp_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
+
+ // Validate accumulate biases kernel for non quantized asymmetric types
+ if(biases != nullptr && !is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAccumulateBiasesKernel::validate(output, biases, gpu_target));
+ }
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output;
+
+ if(!weights_reshaped)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ // Check if we have a fully connected layer with batches
+ const bool is_batched_fc_layer = output->dimension(1) > 1;
+
+ if(is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
+ input->tensor_shape().cend(),
+ output->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = input->num_dimensions() > 1;
+ }
+
+ if(is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
+
+ // Validate im2col kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &im2col_input, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false));
+ input_to_use = &im2col_input;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
+ }
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output, false));
+
+ // Validate output stage for asymmetric quantized types
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(&gemmlowp_output, biases, output));
+ }
+
+ return Status{};
+}
+
void CLFullyConnectedLayer::run()
{
// Reshape of the weights (happens only once)
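A short sketch of the new static validate() entry point added above. It operates purely on TensorInfo descriptors, so nothing is allocated; the boolean conversion of Status (used the same way by the graph changes earlier in this patch) and its error_description() accessor are assumed for the error reporting:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

#include <iostream>

using namespace arm_compute;

void validate_fc_sketch()
{
    // Illustrative 128-input / 16-output fully connected layer, weights not yet transposed.
    const TensorInfo input(TensorShape(128U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(128U, 16U), 1, DataType::F32);
    const TensorInfo biases(TensorShape(16U), 1, DataType::F32);
    const TensorInfo output(TensorShape(16U), 1, DataType::F32);

    const Status status = CLFullyConnectedLayer::validate(&input, &weights, &biases, &output,
                                                          true /* transpose_weights */,
                                                          false /* are_weights_reshaped */);
    if(!status)
    {
        std::cerr << status.error_description() << std::endl;
    }
}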
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index ca0228f..c676a10 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,20 +39,23 @@
using namespace arm_compute;
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
+ _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta)
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
if(c != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix C");
+ ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of rows as the output matrix");
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
@@ -60,7 +63,11 @@
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
// If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- _is_interleaved_transposed = a->info()->dimension(1) > 16;
+ // For Bifrost architectures we do not reshape the input matrices
+ _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
+
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -73,31 +80,17 @@
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- const unsigned int transpose_w = max_cl_vector_width / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position());
- _tmp_a.allocator()->init(info_a);
-
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position());
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
+ // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
_interleave_kernel.configure(a, &_tmp_a);
// Configure transpose kernel
_transpose_kernel.configure(b, &_tmp_b);
+
+ // Manage intermediate buffers
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
}
_mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
@@ -126,8 +119,18 @@
// Run interleave kernel
CLScheduler::get().enqueue(_interleave_kernel, false);
- // Run transpose kernel
- CLScheduler::get().enqueue(_transpose_kernel, false);
+ if(_is_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+
+ _is_first_run = false;
+ }
+ else if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ CLScheduler::get().enqueue(_transpose_kernel, false);
+ }
}
// Run matrix multiply kernel
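
Illustrative usage sketch (not part of the patch): the hunks above add a GEMMInfo argument to CLGEMM::configure() and skip the transpose of B after the first run() when reshape_b_only_on_first_run is set, which suits a constant B such as pretrained weights. Assuming the GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run) argument order and FP32 tensors:

    // CLScheduler::get().default_init(), tensor allocation and fills omitted
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));   // 64 columns, 32 rows
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

    CLGEMM gemm;
    gemm.configure(&a, &b, nullptr, &dst, 1.f, 0.f, GEMMInfo(false, false, true));
    gemm.run(); // first run: interleave A, transpose B, multiply
    gemm.run(); // later runs: the transposed B from the first run is reused
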
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 5c6f5b4..ddcab6a 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -29,9 +29,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
@@ -41,14 +43,9 @@
void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_UNUSED(gemm_info);
+ ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->info()->quantization_info().offset;
@@ -65,18 +62,8 @@
matrix_a = &_tmp_a;
matrix_b = &_tmp_b;
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ TensorInfo info_a(compute_interleaved_shape(*a->info()), 1, a->info()->data_type());
+ TensorInfo info_b(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type());
_tmp_a.allocator()->init(info_a);
_tmp_b.allocator()->init(info_b);
_memory_group.manage(&_tmp_a);
@@ -95,13 +82,7 @@
// Initialize matrix B reduction kernel only if _a_offset is not equal to 0
if(_a_offset != 0)
{
- TensorShape shape_vector_sum_col = b->info()->tensor_shape();
-
- if(shape_vector_sum_col.num_dimensions() > 1)
- {
- shape_vector_sum_col.remove_dimension(1);
- }
- TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+ TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
_memory_group.manage(&_vector_sum_col);
@@ -112,13 +93,7 @@
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorShape shape_vector_sum_row = a->info()->tensor_shape();
- shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
- if(a->info()->num_dimensions() > 1)
- {
- shape_vector_sum_row.remove_dimension(1);
- }
- TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
@@ -147,6 +122,67 @@
}
}
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+ "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+ "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+ "The output matrix must have the same number of columns as the matrix B");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ int32_t a_offset = a->quantization_info().offset;
+ int32_t b_offset = b->quantization_info().offset;
+ bool is_interleaved_transposed = a->dimension(1) > 16;
+
+ if(is_interleaved_transposed)
+ {
+ TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
+ TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(a, b, output));
+ }
+
+ TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if(a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Validate matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Validate matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionKernel::validate(output,
+ a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row,
+ a_offset, b_offset));
+
+ return Status{};
+}
+
void CLGEMMLowpMatrixMultiplyCore::run()
{
_memory_group.acquire();
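
Illustrative sketch (not part of the patch): the argument checks removed from configure() above now live in the new static validate(), so the same checks can be run on bare TensorInfo objects before any OpenCL resources are created. Shapes below are assumptions:

    const TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::QASYMM8);
    const TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::QASYMM8);
    const TensorInfo dst_info(TensorShape(16U, 32U), 1, DataType::S32);

    const Status status = CLGEMMLowpMatrixMultiplyCore::validate(&a_info, &b_info, &dst_info, GEMMInfo());
    if(status.error_code() != ErrorCode::OK)
    {
        // report status.error_description() and skip configure()
    }
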
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 059528f..423faea 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,8 +63,9 @@
void CLHarrisCorners::configure(ICLImage *input, float threshold, float min_dist,
float sensitivity, int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value)
+ BorderMode border_mode, uint8_t constant_border_value, bool use_fp16)
{
+ ARM_COMPUTE_UNUSED(use_fp16);
ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
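
Illustrative call (not part of the patch): the new use_fp16 flag brings the CL signature in line with the NEON version; as the ARM_COMPUTE_UNUSED above shows, it is currently ignored on the CL backend. Parameter values below are assumptions:

    // assuming a U8 CLImage `image`, a CLKeyPointArray `corners` and a CLHarrisCorners instance `harris`
    harris.configure(&image, 20000.f /*threshold*/, 5.f /*min_dist*/, 0.04f /*sensitivity*/,
                     3 /*gradient_size*/, 3 /*block_size*/, &corners,
                     BorderMode::UNDEFINED, 0 /*constant_border_value*/, false /*use_fp16*/);
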
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index 9d6ac7a..b1284db 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -32,7 +32,7 @@
void CLMagnitude::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type, bool use_fp16)
{
- ARM_COMPUTE_UNUSED(use_fp16); //TODO(COMPMID-644): Add half float support
+ ARM_COMPUTE_UNUSED(use_fp16);
auto k = arm_compute::support::cpp14::make_unique<CLMagnitudePhaseKernel>();
k->configure(input1, input2, output, nullptr, mag_type);
diff --git a/src/core/NEON/kernels/winograd/transforms.hpp b/src/runtime/CL/functions/CLPermute.cpp
similarity index 68%
rename from src/core/NEON/kernels/winograd/transforms.hpp
rename to src/runtime/CL/functions/CLPermute.cpp
index 8546ee9..f23e231 100644
--- a/src/core/NEON/kernels/winograd/transforms.hpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,9 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
-#pragma once
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
+#include "support/ToolchainSupport.h"
-#include "transforms/input_2x2_3x3.hpp"
-#include "transforms/kernel_2x2_3x3.hpp"
-#include "transforms/output_2x2_3x3.hpp"
+using namespace arm_compute;
+
+void CLPermute::configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLPermuteKernel>();
+ k->configure(input, output, perm);
+ _kernel = std::move(k);
+}
\ No newline at end of file
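
Illustrative usage (not part of the patch): CLPermute is a thin wrapper over the new CLPermuteKernel and, together with the permute_201/permute_120/permute_3201 kernels registered earlier in this patch, rearranges tensor dimensions according to a PermutationVector. Assuming `src` and `dst` are CLTensors whose shapes are consistent with the chosen permutation:

    CLPermute permute;
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U)); // one of the permutations backed by the new kernels
    permute.run();
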
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 2341633..201bf87 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,16 +40,14 @@
k->configure(input, output, pool_info);
_kernel = std::move(k);
- // Configure border depending on operation required
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- // Quantize border in case data type is quantized asymmetric data type
- uint32_t border_value = 0;
+ PixelValue zero_value(0.f);
if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
{
- border_value = static_cast<uint32_t>(input->info()->quantization_info().quantize(0.f, RoundingPolicy::TO_NEAREST_UP));
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
-
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(border_value));
+ _border_handler.configure(input, _kernel->border_size(), border_mode, zero_value);
}
Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index b2235ea..fcc8559 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -28,11 +28,27 @@
using namespace arm_compute;
-GCScheduler::GCScheduler() = default;
+std::once_flag GCScheduler::_initialize_symbols;
+
+GCScheduler::GCScheduler()
+ : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT)
+{
+}
+
+GCScheduler::~GCScheduler()
+{
+ eglDestroyContext(_display, _context);
+ eglTerminate(_display);
+
+ _context = EGL_NO_CONTEXT;
+ _display = EGL_NO_DISPLAY;
+}
void GCScheduler::default_init()
{
- GCKernelLibrary::get().init("./cs_shaders/");
+ setup_context();
+
+ GCKernelLibrary::get().init("./cs_shaders/", _display, _context);
}
void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
@@ -42,11 +58,12 @@
GCScheduler &GCScheduler::get()
{
+ std::call_once(_initialize_symbols, opengles31_is_available);
static GCScheduler scheduler;
return scheduler;
}
-void GCScheduler::enqueue(IGCKernel &kernel, bool flush)
+void GCScheduler::dispatch(IGCKernel &kernel, bool flush)
{
kernel.run(kernel.window());
if(flush)
@@ -55,7 +72,60 @@
}
}
-void GCScheduler::sync()
+void GCScheduler::memory_barrier()
{
ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));
}
+
+void GCScheduler::setup_context()
+{
+ EGLBoolean res;
+ _display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
+
+ ARM_COMPUTE_ERROR_ON_MSG(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
+
+ res = eglInitialize(_display, nullptr, nullptr);
+
+ ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
+ ARM_COMPUTE_UNUSED(res);
+
+ const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS);
+ ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
+ ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
+ ARM_COMPUTE_UNUSED(egl_extension_st);
+
+ const EGLint config_attribs[] =
+ {
+ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
+ EGL_NONE
+ };
+ EGLConfig cfg;
+ EGLint count;
+
+ res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count);
+
+ ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
+ ARM_COMPUTE_UNUSED(res);
+
+ res = eglBindAPI(EGL_OPENGL_ES_API);
+
+ ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
+
+ const EGLint attribs[] =
+ {
+ EGL_CONTEXT_CLIENT_VERSION, 3,
+ EGL_NONE
+ };
+ _context = eglCreateContext(_display,
+ cfg,
+ EGL_NO_CONTEXT,
+ attribs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
+ ARM_COMPUTE_UNUSED(res);
+
+ res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
+
+ ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
+ ARM_COMPUTE_UNUSED(res);
+}
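
Illustrative sketch (not part of the patch): with the hunks above the GLES scheduler owns a surfaceless EGL context created in setup_context(), and the enqueue()/sync() pair becomes dispatch()/memory_barrier(). Every caller updated below follows the same pattern; with placeholder kernel objects it looks like:

    GCScheduler::get().default_init();                   // creates the EGL display/context and initialises the shader library
    GCScheduler::get().dispatch(producer_kernel, false);
    GCScheduler::get().memory_barrier();                 // glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)
    GCScheduler::get().dispatch(consumer_kernel);        // second argument omitted: default flush behaviour
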
diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
index 199ee46..f2926b0 100644
--- a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
+++ b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
@@ -38,7 +38,7 @@
{
ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured");
- GCScheduler::get().enqueue(_border_handler, false);
- GCScheduler::get().sync();
- GCScheduler::get().enqueue(*_kernel);
+ GCScheduler::get().dispatch(_border_handler, false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(*_kernel);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
new file mode 100755
index 0000000..b99716b
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h"
+
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+using namespace arm_compute;
+
+void GCArithmeticAddition::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<GCArithmeticAdditionKernel>();
+ k->configure(input1, input2, output, policy);
+ _kernel = std::move(k);
+}
+
+Status GCArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
+{
+ return GCArithmeticAdditionKernel::validate(input1, input2, output, policy);
+}
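
Illustrative usage of the new function (not part of the patch; tensor initialisation and allocation are omitted):

    GCTensor input1, input2, output; // matching shapes and a data type supported by the kernel
    GCArithmeticAddition add;
    add.configure(&input1, &input2, &output, ConvertPolicy::SATURATE);
    add.run();

    // The checks can also be run up front via the new static validate():
    const Status status = GCArithmeticAddition::validate(input1.info(), input2.info(), output.info(), ConvertPolicy::SATURATE);
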
diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
index 2e546a6..99bdf43 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
@@ -44,5 +44,5 @@
void GCBatchNormalizationLayer::run()
{
- GCScheduler::get().enqueue(_norm_kernel, true);
+ GCScheduler::get().dispatch(_norm_kernel, true);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
new file mode 100644
index 0000000..5689722
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+#include <cmath>
+#include <memory>
+#include <tuple>
+
+using namespace arm_compute;
+
+GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
+ : _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
+{
+}
+
+void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose1xW)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
+ const unsigned bias_element = (append_biases) ? 1 : 0;
+ const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
+
+ _transpose1xW = transpose1xW;
+
+ if(transpose1xW)
+ {
+ // Create tensor to store the reshaped weights
+ const unsigned int mat_weights_cols = weights->info()->dimension(3);
+ const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
+ TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
+ const DataType dt = weights->info()->data_type();
+ const int fixed_point_position = weights->info()->fixed_point_position();
+ TensorInfo info_wr(shape_wr, 1, dt, fixed_point_position);
+
+ _weights_reshaped.allocator()->init(info_wr);
+ _weights_reshape_kernel.configure(weights, biases_to_use, &_weights_reshaped);
+ _weights_transposed_kernel.configure(&_weights_reshaped, output);
+ _weights_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ _weights_reshape_kernel.configure(weights, biases_to_use, output);
+ }
+}
+
+void GCConvolutionLayerReshapeWeights::run()
+{
+ GCScheduler::get().dispatch(_weights_reshape_kernel);
+ if(_transpose1xW)
+ {
+ GCScheduler::get().dispatch(_weights_transposed_kernel);
+ }
+}
+
+GCConvolutionLayer::GCConvolutionLayer()
+ : _reshape_weights(), _input_im2col_kernel(), _input_interleave_kernel(), _mm_kernel(), _output_col2im_kernel(), _fill_border(), _input_im2col_reshaped(), _input_interleaved_reshaped(),
+ _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
+{
+}
+
+void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output, bool is_interleaved_transposed)
+{
+ _mm_kernel.configure(input, weights, output, 1.f, is_interleaved_transposed);
+}
+
+void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
+ ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
+ }
+
+ const DataType dt = input->info()->data_type();
+
+ _append_bias = (biases != nullptr);
+ _are_weights_reshaped = weights_info.are_reshaped();
+
+ const unsigned bias_element = (_append_bias) ? 1 : 0;
+ const IGCTensor *biases_to_use = (_append_bias) ? biases : nullptr;
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
+ const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ // Check if it's a "fully connected" convolution
+ _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+ const bool run_interleaved = (!_is_fully_connected_convolution);
+
+ unsigned int mat_weights_cols = weights->info()->dimension(3);
+ unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
+
+ // Reshape weights if needed
+ if(_are_weights_reshaped)
+ {
+ if(_is_fully_connected_convolution)
+ {
+ mat_weights_cols = weights->info()->dimension(0);
+ mat_weights_rows = weights->info()->dimension(1);
+ }
+ else
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ const unsigned int quarter_reshaped_cols = weights->info()->dimension(0) / 4;
+ mat_weights_rows = quarter_reshaped_cols + bias_element;
+ }
+ }
+ else
+ {
+ if(_is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ int num_elems_read_per_iteration_x = 1;
+ if(dt == DataType::F16)
+ {
+ num_elems_read_per_iteration_x = 2;
+ }
+ TensorShape shape_wr((ceil_to_multiple(mat_weights_cols, num_elems_read_per_iteration_x)), mat_weights_rows);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wr));
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, false /* 1xW transpose */);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->info()->element_size();
+ TensorShape shape_wt(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_wt));
+ _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped, true /* 1xW transpose */);
+ }
+ weights = &_weights_reshaped;
+ }
+
+ // Create tensor to store im2col reshaped inputs
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->info()->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ TensorInfo im2col_reshaped_info(shape_im2col, 1, dt, input->info()->fixed_point_position());
+ _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
+
+ // Create tensor (interleave) to prepare input tensor for GEMM
+ if(run_interleaved)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ TensorInfo interleaved_info(shape_interleaved, 1, dt, input->info()->fixed_point_position());
+ _input_interleaved_reshaped.allocator()->init(interleaved_info);
+ }
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ const DataType gemm_data_type = dt;
+
+ // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
+ TensorInfo info_gemm(shape_gemm, 1, gemm_data_type, input->info()->fixed_point_position());
+ _gemm_output.allocator()->init(info_gemm);
+
+ // Configure kernels
+ if(dt == DataType::F16)
+ {
+ BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
+ input->info()->extend_padding(border_size);
+ _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue(0)); // for the FP16 im2col path, treat the padded region as a constant border
+ }
+ _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _append_bias);
+
+ // Configure matrix multiply
+ if(run_interleaved)
+ {
+ _input_interleave_kernel.configure(&_input_im2col_reshaped, &_input_interleaved_reshaped);
+ configure_mm(&_input_interleaved_reshaped, weights, &_gemm_output);
+ _input_interleaved_reshaped.allocator()->allocate();
+ }
+ else
+ {
+ configure_mm(&_input_im2col_reshaped, weights, &_gemm_output, false);
+ }
+ _input_im2col_reshaped.allocator()->allocate();
+
+ // Configure Col2Im
+ _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+ _gemm_output.allocator()->allocate();
+
+ ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ // Allocate intermediate tensor
+ if(!_are_weights_reshaped)
+ {
+ _weights_reshaped.allocator()->allocate();
+ }
+}
+
+void GCConvolutionLayer::run()
+{
+ // Run weights reshaping (runs only once after each configure)
+ if(!_are_weights_reshaped)
+ {
+ _are_weights_reshaped = true;
+ _reshape_weights.run();
+ }
+
+ // Run im2col
+ GCScheduler::get().dispatch(_fill_border);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_input_im2col_kernel);
+
+ if(!_is_fully_connected_convolution)
+ {
+ GCScheduler::get().memory_barrier();
+ // Run interleave4x4
+ GCScheduler::get().dispatch(_input_interleave_kernel);
+ }
+
+ GCScheduler::get().memory_barrier();
+ // Runs matrix multiply on reshaped matrices
+ GCScheduler::get().dispatch(_mm_kernel);
+
+ GCScheduler::get().memory_barrier();
+ // Reshape output matrix
+ GCScheduler::get().dispatch(_output_col2im_kernel, false);
+}
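
Illustrative usage (not part of the patch): the new GCConvolutionLayer follows the familiar im2col -> (interleave) -> GEMM -> col2im pipeline, with a memory barrier between the stages as run() above shows. Shapes and pad/stride values below are assumptions:

    GCTensor src, weights, biases, dst; // initialisation and allocation omitted
    GCConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1 /*stride_x*/, 1 /*stride_y*/, 1 /*pad_x*/, 1 /*pad_y*/),
                   WeightsInfo());
    conv.run();
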
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
index ee0b121..689d8be 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.cpp
@@ -63,7 +63,8 @@
{
for(unsigned i = 0; i < _num_inputs; i++)
{
- GCScheduler::get().enqueue(_border_handlers_vector[i], false);
- GCScheduler::get().enqueue(_concat_kernels_vector[i], true);
+ GCScheduler::get().dispatch(_border_handlers_vector[i], false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_concat_kernels_vector[i], true);
}
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
new file mode 100644
index 0000000..ef65989
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
+
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info)
+{
+ auto k = arm_compute::support::cpp14::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
+ k->configure(input, weights, biases, output, conv_info);
+ _kernel = std::move(k);
+
+ // Configure border handler
+ _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
+}
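
Illustrative usage of the new 3x3 depthwise convolution (not part of the patch; shapes and pad/stride are assumptions):

    GCTensor src, weights, biases, dst; // weights laid out as 3x3 per input channel; setup omitted
    GCDepthwiseConvolutionLayer3x3 dwc;
    dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
    dwc.run(); // via IGCSimpleFunction: fills the border, issues a memory barrier, then dispatches the kernel
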
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
index 032c2fd..6407464 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
@@ -46,5 +46,5 @@
void GCDropoutLayer::run()
{
- GCScheduler::get().enqueue(_dropout_kernel);
+ GCScheduler::get().dispatch(_dropout_kernel);
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index 63cb40e..9e4f0f6 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@
_im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
// Configure im2col kernel
- _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false);
+ _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
// Configure matrix multiply kernel
_mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
@@ -159,19 +159,22 @@
// Linearize input if it comes from a convolutional layer
if(_is_fc_after_conv)
{
- GCScheduler::get().enqueue(_im2col_kernel, false);
+ GCScheduler::get().dispatch(_im2col_kernel, false);
}
- GCScheduler::get().sync();
+ if(!_are_weights_reshaped || _is_fc_after_conv)
+ {
+ GCScheduler::get().memory_barrier();
+ }
// Run matrix multiply
- GCScheduler::get().enqueue(_mm_kernel, !_accumulate_biases);
+ GCScheduler::get().dispatch(_mm_kernel, !_accumulate_biases);
// Accumulate biases if provided
if(_accumulate_biases)
{
- GCScheduler::get().sync();
+ GCScheduler::get().memory_barrier();
- GCScheduler::get().enqueue(_accumulate_biases_kernel);
+ GCScheduler::get().dispatch(_accumulate_biases_kernel);
}
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index c47a0e7..7aa2d42 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -38,6 +38,7 @@
#include "arm_compute/runtime/ITensorAllocator.h"
using namespace arm_compute;
+using namespace arm_compute::gles_compute;
GCGEMM::GCGEMM()
: _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false)
@@ -116,18 +117,20 @@
if(_is_interleaved_transposed)
{
// Run interleave kernel
- GCScheduler::get().enqueue(_interleave_kernel, false);
+ GCScheduler::get().dispatch(_interleave_kernel, false);
// Run transpose kernel
- GCScheduler::get().enqueue(_transpose_kernel, false);
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ GCScheduler::get().memory_barrier();
}
// Run matrix multiply kernel
- GCScheduler::get().enqueue(_mm_kernel, !_run_addition);
+ GCScheduler::get().dispatch(_mm_kernel, !_run_addition);
// Run matrix addition kernel
if(_run_addition)
{
- GCScheduler::get().enqueue(_ma_kernel);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_ma_kernel);
}
}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index d30ed52..fc3882d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -55,7 +55,9 @@
void GCNormalizationLayer::run()
{
- GCScheduler::get().enqueue(_multiply_kernel, false);
- GCScheduler::get().enqueue(_border_handler, false);
- GCScheduler::get().enqueue(_norm_kernel, false);
+ GCScheduler::get().dispatch(_multiply_kernel, false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_border_handler, false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_norm_kernel, true);
}
diff --git a/src/core/NEON/kernels/winograd/utils.hpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
old mode 100644
new mode 100755
similarity index 62%
copy from src/core/NEON/kernels/winograd/utils.hpp
copy to src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
index 14e709f..5fb971c
--- a/src/core/NEON/kernels/winograd/utils.hpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright (c) 2017 ARM Limited.
*
@@ -22,34 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#pragma once
-#include <ctime>
-inline double TimeInUs(void) {
-#ifdef CYCLE_PROFILING
- timespec t;
- clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
- return 1e6*t.tv_sec + 1e-3*t.tv_nsec;
-#else
- return 0;
-#endif
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+
+using namespace arm_compute;
+
+GCNormalizePlanarYUVLayer::GCNormalizePlanarYUVLayer()
+ : _norm_kernel()
+{
}
-inline int iceildiv(const int a, const int b) {
- return (a + b - 1) / b;
+void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *sd)
+{
+ _norm_kernel.configure(input, output, mean, sd);
}
-template <typename T>
-inline T roundup(const T a, const T b) {
- return a + b - (a % b);
-}
-
-inline void PrintMatrix(const float* const m, const int M, const int N, const int row_stride) {
- for (int i = 0; i < M; i++) {
- for (int j = 0; j < N; j++) {
- printf("%.3f ", m[i*row_stride + j]);
- }
- printf("\n");
- }
- printf("\n");
+void GCNormalizePlanarYUVLayer::run()
+{
+ GCScheduler::get().dispatch(_norm_kernel, true);
}
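
Illustrative usage of the new function (not part of the patch; tensor setup is omitted):

    GCTensor src, dst, mean, sd; // mean and sd hold one value per channel
    GCNormalizePlanarYUVLayer norm;
    norm.configure(&src, &dst, &mean, &sd);
    norm.run();
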
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index 46a60cd..ff03eff 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
@@ -23,8 +23,8 @@
*/
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/core/PixelValue.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
@@ -40,3 +40,8 @@
BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
_border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
}
+
+Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+ return GCPoolingLayerKernel::validate(input, output, pool_info);
+}
\ No newline at end of file
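
Illustrative sketch (not part of the patch): the new static validate() forwards to GCPoolingLayerKernel::validate(), so pooling parameters can be checked before configure(). Pool size and strides below are assumptions:

    // assuming `src` and `dst` are GCTensors with their TensorInfo already set
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0));
    const Status status = GCPoolingLayer::validate(src.info(), dst.info(), pool_info);
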
diff --git a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
new file mode 100644
index 0000000..cfe65a3
--- /dev/null
+++ b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
+#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void GCScale::configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
+{
+ auto k = arm_compute::support::cpp14::make_unique<GCScaleKernel>();
+ k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
+ _kernel = std::move(k);
+ _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+}
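
Illustrative usage of the new GCScale function (not part of the patch; interpolation, border and sampling choices are assumptions):

    GCTensor src, dst; // dst's TensorInfo defines the scaled output size; setup omitted
    GCScale scale;
    scale.configure(&src, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::CONSTANT, PixelValue(0), SamplingPolicy::CENTER);
    scale.run();
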
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 34464ff..5221c5c 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -63,9 +63,9 @@
void GCSoftmaxLayer::run()
{
- GCScheduler::get().enqueue(_max_kernel, false);
- GCScheduler::get().sync();
- GCScheduler::get().enqueue(_shift_exp_sum_kernel, false);
- GCScheduler::get().sync();
- GCScheduler::get().enqueue(_norm_kernel);
+ GCScheduler::get().dispatch(_max_kernel, false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
+ GCScheduler::get().memory_barrier();
+ GCScheduler::get().dispatch(_norm_kernel);
}
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index f10ffa6..b84dfd3 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,8 @@
}
template <unsigned int matrix_size>
-void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value)
+void NEConvolutionSquare<matrix_size>::configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode,
+ uint8_t constant_border_value)
{
ARM_COMPUTE_ERROR_ON(conv == nullptr);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 25c639f..8f7d940 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -44,6 +44,16 @@
namespace arm_compute
{
+namespace
+{
+TensorShape get_reshaped_weights_shape(const ITensorInfo *weights, bool has_bias)
+{
+ const unsigned int mat_weights_cols = weights->dimension(3);
+ const unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+ return TensorShape(mat_weights_cols, mat_weights_rows);
+}
+} // namespace
+
NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _weights_reshape_kernel(), _weights_transposed_kernel(), _weights_reshaped(), _transpose1xW(false)
{
@@ -51,18 +61,12 @@
void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
+ (biases != nullptr) ? biases->info() : nullptr,
+ output->info(),
+ transpose1xW));
// Check if biases are present; if so, they will be embedded into the weights matrix
const bool _has_bias = (biases != nullptr);
@@ -72,10 +76,7 @@
if(transpose1xW)
{
// Create tensor to store the reshaped weights
- const unsigned int mat_weights_cols = weights->info()->dimension(3);
- const unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
- TensorShape shape_wr(mat_weights_cols, mat_weights_rows);
- TensorInfo info_wr(shape_wr, 1, weights->info()->data_type(), weights->info()->fixed_point_position());
+ TensorInfo info_wr = weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(get_reshaped_weights_shape(weights->info(), _has_bias));
_weights_reshaped.allocator()->init(info_wr);
_memory_group.manage(&_weights_reshaped);
@@ -91,6 +92,46 @@
}
}
+Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ // Check if biases are present; if so, they will be embedded into the weights matrix
+ const bool has_bias = (biases != nullptr);
+
+ // Checks performed when biases are present
+ if(has_bias)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if(transpose1xW)
+ {
+ TensorInfo weights_reshaped = weights->clone()->set_tensor_shape(get_reshaped_weights_shape(weights, has_bias));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, &weights_reshaped));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(&weights_reshaped, output));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEWeightsReshapeKernel::validate(weights, biases, output));
+ }
+
+ return Status{};
+}
+
void NEConvolutionLayerReshapeWeights::run()
{
_memory_group.acquire();
@@ -105,6 +146,62 @@
_memory_group.release();
}
+namespace
+{
+TensorShape get_reshaped_weights_shape_conv(const ITensorInfo *weights, bool has_bias, bool is_fully_connected_convolution)
+{
+ unsigned int mat_weights_cols = weights->dimension(3);
+ unsigned int mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+
+ if(is_fully_connected_convolution)
+ {
+ // Create tensor to store the reshaped weights
+ return TensorShape(mat_weights_cols, mat_weights_rows);
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / weights->element_size();
+ return TensorShape(mat_weights_rows * static_cast<unsigned int>(transpose_width), static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)));
+ }
+}
+
+Status validate_and_initialize_values(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, DataType &dt,
+ bool &has_bias,
+ bool &are_weights_reshaped, unsigned int &kernel_width, unsigned int &kernel_height, bool &is_fully_connected_convolution, unsigned int &mat_weights_cols, unsigned int &mat_weights_rows,
+ unsigned int &conv_w, unsigned int &conv_h)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && weights->dimension(2) != input->dimension(2));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON(!weights_info.are_reshaped() && biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ dt = input->data_type();
+ has_bias = (biases != nullptr);
+ are_weights_reshaped = weights_info.are_reshaped();
+ kernel_width = (are_weights_reshaped) ? weights_info.kernel_size().first : weights->dimension(0);
+ kernel_height = (are_weights_reshaped) ? weights_info.kernel_size().second : weights->dimension(1);
+ mat_weights_cols = weights->dimension(3);
+ mat_weights_rows = weights->dimension(0) * weights->dimension(1) * weights->dimension(2) + (has_bias ? 1 : 0);
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_width, kernel_height,
+ conv_info);
+
+ is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+
+ return Status{};
+}
+} // namespace
+
NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_optimised_kernel(nullptr), _output_col2im_kernel(),
_input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(), _workspace(), _has_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false)
@@ -113,42 +210,25 @@
void NEConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, weights);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
- ARM_COMPUTE_ERROR_ON(!weights_info.are_reshaped() && biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
+ DataType dt{};
+ unsigned int kernel_width = 0;
+ unsigned int kernel_height = 0;
+ unsigned int mat_weights_cols = 0;
+ unsigned int mat_weights_rows = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
- const DataType dt = input->info()->data_type();
- const int fixed_point_position = input->info()->fixed_point_position();
+ Status status = validate_and_initialize_values(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), conv_info, weights_info, dt, _has_bias, _are_weights_reshaped,
+ kernel_width, kernel_height,
+ _is_fully_connected_convolution,
+ mat_weights_cols, mat_weights_rows, conv_w, conv_h);
- _has_bias = (biases != nullptr);
- _are_weights_reshaped = weights_info.are_reshaped();
+ ARM_COMPUTE_ERROR_THROW_ON(status);
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- const unsigned int kernel_width = (_are_weights_reshaped) ? weights_info.kernel_size().first : weights->info()->dimension(0);
- const unsigned int kernel_height = (_are_weights_reshaped) ? weights_info.kernel_size().second : weights->info()->dimension(1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info);
-
- // Check if its a "fully connected" convolution, i.e. the output size is 1x1xnum_kernels
- _is_fully_connected_convolution = ((conv_w == 1) && (conv_h == 1));
+ const unsigned int fixed_point_position = input->info()->fixed_point_position();
#if defined(__arm__)
if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
@@ -162,9 +242,6 @@
}
#endif /* defined(__arm__) || defined(__aarch64__) */
- unsigned int mat_weights_cols = weights->info()->dimension(3);
- unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + (_has_bias ? 1 : 0);
-
// Reshape weights if needed
if(_mm_optimised_kernel != nullptr)
{
@@ -230,7 +307,7 @@
shape_im2col.set(0, mat_input_cols);
shape_im2col.set(1, mat_input_rows);
shape_im2col.set(2, 1);
- _input_im2col_reshaped.allocator()->init(TensorInfo(shape_im2col, 1, dt, fixed_point_position));
+ _input_im2col_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
_memory_group.manage(&_input_im2col_reshaped);
// Create tensor (interleave) to prepare input tensor for GEMM
@@ -239,7 +316,7 @@
TensorShape shape_interleaved(shape_im2col);
shape_interleaved.set(0, shape_interleaved.x() * 4);
shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
- _input_interleaved_reshaped.allocator()->init(TensorInfo(shape_interleaved, 1, dt, fixed_point_position));
+ _input_interleaved_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_interleaved));
_memory_group.manage(&_input_interleaved_reshaped);
}
@@ -247,7 +324,7 @@
TensorShape shape_gemm(_input_im2col_reshaped.info()->tensor_shape());
shape_gemm.set(0, mat_weights_cols);
shape_gemm.set(1, mat_input_rows);
- _gemm_output.allocator()->init(TensorInfo(shape_gemm, 1, dt, fixed_point_position));
+ _gemm_output.allocator()->init(_input_im2col_reshaped.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_gemm));
_memory_group.manage(&_gemm_output);
// Configure kernels
@@ -273,14 +350,7 @@
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- if(_is_fully_connected_convolution)
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace, 1.f, 0.f);
- }
- else
- {
- _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
- }
+ _mm_optimised_kernel->configure(&_input_im2col_reshaped, weights, &_gemm_output, &_workspace);
_workspace.allocator()->allocate();
}
@@ -303,8 +373,6 @@
_output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
_gemm_output.allocator()->allocate();
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
// Allocate intermediate tensor
if(!_are_weights_reshaped)
{
@@ -312,6 +380,128 @@
}
}
+Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info)
+{
+ DataType dt{};
+ bool has_bias{};
+ bool are_weights_reshaped{};
+ bool is_fully_connected_convolution{};
+ unsigned int kernel_width = 0;
+ unsigned int kernel_height = 0;
+ unsigned int mat_weights_cols = 0;
+ unsigned int mat_weights_rows = 0;
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ Status status = validate_and_initialize_values(input, weights, biases, conv_info, weights_info, dt, has_bias, are_weights_reshaped, kernel_width, kernel_height,
+ is_fully_connected_convolution, mat_weights_cols, mat_weights_rows,
+ conv_w, conv_h);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(status);
+
+ std::unique_ptr<ITensorInfo> reshaped_weights = weights->clone();
+ bool optimised_kernel = false;
+
+#if defined(__arm__)
+ if(NEScheduler::get().cpu_info().CPU == CPUTarget::ARMV7 && dt == DataType::F32)
+ {
+ optimised_kernel = true;
+ }
+#elif defined(__aarch64__)
+ if(NEScheduler::get().cpu_info().CPU >= CPUTarget::ARMV8 && dt == DataType::F32)
+ {
+ optimised_kernel = true;
+ }
+#endif /* defined(__arm__) || defined(__aarch64__) */
+
+ // Reshape weights if needed
+ if(optimised_kernel)
+ {
+ if(are_weights_reshaped)
+ {
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->dimension(1);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape{ mat_weights_cols, mat_weights_rows };
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+ }
+ }
+ else
+ {
+ if(are_weights_reshaped)
+ {
+ const unsigned int transpose_width = 16 / input->element_size();
+ mat_weights_cols = weights_info.num_kernels();
+ mat_weights_rows = weights->dimension(0) / transpose_width + (has_bias ? 1 : 0);
+ }
+ else
+ {
+ TensorShape reshaped_weights_shape;
+
+ if(is_fully_connected_convolution)
+ {
+ reshaped_weights_shape = TensorShape{ mat_weights_cols, mat_weights_rows };
+ }
+ else
+ {
+ // Create tensor to store transposed weights
+ const float transpose_width = 16.0f / input->element_size();
+ reshaped_weights_shape = TensorShape{ mat_weights_rows * static_cast<unsigned int>(transpose_width),
+ static_cast<unsigned int>(std::ceil(mat_weights_cols / transpose_width)) };
+ }
+
+ // Create tensor to store the reshaped weights
+ reshaped_weights->set_tensor_shape(get_reshaped_weights_shape_conv(weights, has_bias, is_fully_connected_convolution));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, biases, reshaped_weights.get(), !is_fully_connected_convolution /* 1xW transpose */));
+ weights = reshaped_weights.get();
+ }
+ }
+
+ // Validate im2col
+ const unsigned int mat_input_cols = mat_weights_rows;
+ const unsigned int mat_input_rows = conv_w * conv_h;
+ TensorShape shape_im2col = input->tensor_shape();
+ shape_im2col.set(0, mat_input_cols);
+ shape_im2col.set(1, mat_input_rows);
+ shape_im2col.set(2, 1);
+ TensorInfo im2_col_info = input->clone()->set_tensor_shape(shape_im2col);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2_col_info, Size2D(weights->dimension(0), weights->dimension(1)), conv_info, has_bias));
+
+ // Create GEMM output tensor
+ TensorShape shape_gemm(im2_col_info.tensor_shape());
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, mat_input_rows);
+ TensorInfo gemm_output_info = input->clone()->set_tensor_shape(shape_gemm);
+
+ // Validate GEMM interleave and multiply
+ if(!is_fully_connected_convolution)
+ {
+ TensorShape shape_interleaved = shape_im2col;
+ shape_interleaved.set(0, shape_interleaved.x() * 4);
+ shape_interleaved.set(1, std::ceil(shape_interleaved.y() / 4.f));
+ TensorInfo input_interleaved_info = input->clone()->set_tensor_shape(shape_interleaved);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(&im2_col_info, &input_interleaved_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&input_interleaved_info, weights, &gemm_output_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(&im2_col_info, weights, &gemm_output_info));
+ }
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != conv_w) || (output->dimension(1) != conv_h), "Output shape does not match the expected one");
+
+ return Status{};
+}
+
void NEConvolutionLayer::run()
{
// Run weights reshaping (Runs once for every configure)
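The hunk above introduces a static NEConvolutionLayer::validate() that mirrors configure() but operates purely on ITensorInfo descriptors, so a configuration can be checked before any tensor is allocated. A minimal usage sketch follows; the shapes, the F32 data type, the zero-padding PadStrideInfo and the boolean conversion of Status are assumptions made for the example, not values taken from this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

using namespace arm_compute;

bool convolution_config_is_supported()
{
    // Hypothetical shapes: 32x32x3 input, eight 3x3x3 kernels, stride 1, no padding -> 30x30x8 output.
    const TensorInfo input(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32);
    const TensorInfo output(TensorShape(30U, 30U, 8U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 0, 0);

    // No bias and weights not pre-reshaped (default WeightsInfo).
    const Status status = NEConvolutionLayer::validate(&input, &weights, nullptr, &output, conv_info, WeightsInfo());
    return static_cast<bool>(status); // assumed to be true when the configuration is supported
}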
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 7b4e77b..7bce8a6 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,39 +24,43 @@
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
- _scale_f(),
_conv_f(),
- _scaled_output()
+ _scaled_output(),
+ _input(nullptr),
+ _info(),
+ _inner_border()
{
}
void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info,
- unsigned int ax, unsigned int ay, float upscalex, float upscaley)
+ unsigned int inner_border_right, unsigned int inner_border_top)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) < 1);
+ ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 1 && weights->info()->dimension(0) != 3 && weights->info()->dimension(0) != 5);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
- info.pad().first, info.pad().second, ax, ay, upscalex, upscaley, info.round());
+ _input = input;
+ _info = info;
+ _inner_border = std::make_pair(inner_border_right, inner_border_top);
+
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+ auto out_dims = deconvolution_output_dimensions(input->info()->dimension(0), input->info()->dimension(1), weights->info()->dimension(0), weights->info()->dimension(1),
+ info.pad().first, info.pad().second, inner_border_right, inner_border_top, stride_x, stride_y);
const TensorShape output_shape = deconvolution_output_shape(out_dims, input->info()->tensor_shape(), weights->info()->tensor_shape());
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights, bias);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, weights, bias);
-
+ ARM_COMPUTE_UNUSED(output_shape);
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
ARM_COMPUTE_ERROR_ON_MSG(output->info()->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
@@ -64,51 +68,51 @@
_memory_group.manage(&_scaled_output);
// configure scale function
- //Init and allocate intermmidiate tensor for output, same size as input but the first two axis are the same as the output tensor
- TensorShape scale_out_shape(input->info()->tensor_shape());
- scale_out_shape.set(0, output->info()->dimension(0));
- scale_out_shape.set(1, output->info()->dimension(1));
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ // Init and allocate intermediate tensor for output, same size as input but the first two axes are the same as the output tensor
+ const TensorInfo scale_out_info(compute_deconvolution_shape(*input->info(), stride_x, stride_y, inner_border_right, inner_border_top, info), 1, input->info()->data_type(),
+ input->info()->fixed_point_position());
_scaled_output.allocator()->init(scale_out_info);
- const unsigned int kernel_size = weights->info()->dimension(0);
- // Padding for the upsampled image is calculated with the equiation: p' = k - p - 1, where k is kernel size and p is the input padding
- ARM_COMPUTE_ERROR_ON(info.pad().first > (kernel_size - 1));
- const unsigned int tr_px = kernel_size - info.pad().first - 1;
- const unsigned int tr_py = kernel_size - info.pad().second - 1;
- const unsigned int tr_stride = 1;
- const PadStrideInfo transposed_info(tr_stride, tr_stride, tr_px, tr_py);
- _scale_f.configure(input, &_scaled_output, std::make_pair(ax, ay), std::make_pair(info.stride().first - 1u, info.stride().second - 1u), transposed_info);
+
// setup the function to convolve the upscaled output
- switch(kernel_size)
- {
- case 1:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
- break;
- }
- case 3:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
- break;
- }
- case 5:
- {
- _conv_f.configure(&_scaled_output, weights, bias, output, PadStrideInfo(1, 1, 2, 2, DimensionRoundingType::CEIL));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- break;
- }
- }
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(&_scaled_output, weights, bias, output, conv_info);
_scaled_output.allocator()->allocate();
}
void NEDeconvolutionLayer::run()
{
_memory_group.acquire();
- _scale_f.run();
+
+ // Initialize _scaled_output buffer
+ const int width_in = _input->info()->dimension(0);
+ const int height_in = _input->info()->dimension(1);
+ const int width_scaled = _scaled_output.info()->dimension(0);
+ const int height_scaled = _scaled_output.info()->dimension(1);
+ const int num_2d_slices = _input->info()->tensor_shape().total_size() / (width_in * height_in);
+ const int stride_x = _info.stride().first;
+ const int stride_y = _info.stride().second;
+
+ std::fill_n(reinterpret_cast<float *>(_scaled_output.buffer()), _scaled_output.info()->tensor_shape().total_size(), 0.f);
+
+ // scaled_output is the input for the forward convolution. We copy the input elements to scaled_output
+ // and insert rows and columns with zeroes depending on the stride values.
+ for(int slice = 0; slice < num_2d_slices; ++slice)
+ {
+ const int start_x = _info.pad().first;
+ const int start_y = _inner_border.second + _info.pad().second;
+ const int end_y = height_scaled - _info.pad().second;
+ const int end_x = width_scaled - _inner_border.first - _info.pad().first;
+
+ for(int yi = start_y, in_y = 0; yi < end_y; yi += stride_y, in_y++)
+ {
+ for(int xi = start_x, in_x = 0; xi < end_x; xi += stride_x, in_x++)
+ {
+ const auto in = *(reinterpret_cast<float *>(_input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(in_x, in_y, slice))));
+ *(reinterpret_cast<float *>(_scaled_output.buffer() + _scaled_output.info()->offset_element_in_bytes(Coordinates(xi, yi, slice)))) = in;
+ }
+ }
+ }
+
_conv_f.run();
_memory_group.release();
}
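NEDeconvolutionLayer::run() now performs the upsampling inline: the input is copied into _scaled_output with stride - 1 zero rows/columns inserted between neighbouring samples, and the stride-1 forward convolution configured above then produces the deconvolution result. A standalone sketch of that zero-insertion pattern for a single row is shown below; the function name and buffer handling are illustrative only and are not part of the library.

#include <cstddef>
#include <vector>

// Spread 'in' into a zero-initialised row, writing one sample every 'stride' elements
// starting at offset 'pad' -- the same pattern the loop above applies per 2D slice.
std::vector<float> upsample_row(const std::vector<float> &in, size_t stride, size_t pad, size_t out_width)
{
    std::vector<float> out(out_width, 0.f);
    size_t xo = pad;
    for(size_t xi = 0; xi < in.size() && xo < out_width; ++xi, xo += stride)
    {
        out[xo] = in[xi];
    }
    return out;
}

// Example: upsample_row({1.f, 2.f, 3.f}, 2, 0, 6) yields {1, 0, 2, 0, 3, 0}.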
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
deleted file mode 100644
index 79b9b2d..0000000
--- a/src/runtime/NEON/functions/NEDeconvolutionLayerUpsample.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
-
-#include <cmath>
-#include <cstddef>
-#include <utility>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void precompute_offsets(ITensor *offsets, float wr, size_t input_element_size, const std::pair<unsigned int, unsigned int> &a,
- const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == offsets);
- Window win;
- const int padx = info.pad().first;
- const int pady = info.pad().second;
- const int ax = a.first;
- const int ay = a.second;
- const int offset_width = offsets->info()->dimension(0);
- const int offset_height = offsets->info()->dimension(1);
- // The values of ax and ay denote the number of ZEROS to be added on the top and right inner border of the image.
- // Step value along the XY axis will depend on the number of zeros to be inserted between samples (number of zeros + 1).
- // Pre-compute the X offset, Y's stride is unknown at this point so we can't precompute Y's offsets
- for(int yi = ay; yi < (offset_height - pady); yi += (1 + iz.second))
- {
- for(int xi = padx; xi < (offset_width - ax); xi += (1 + iz.first))
- {
- int *ptr = reinterpret_cast<int *>(offsets->ptr_to_element(Coordinates(xi, yi)));
- const size_t in_xi = (xi + 0.5f) * wr;
- *reinterpret_cast<int32_t *>(ptr) = in_xi * input_element_size;
- }
- }
-}
-} // namespace
-
-NEDeconvolutionLayerUpsample::NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _offsets(),
- _border_handler(),
- _upsample()
-{
-}
-
-void NEDeconvolutionLayerUpsample::configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a,
- const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
- }
-
- // Get the tensor shape
- const TensorShape shape(output->info()->dimension(0), output->info()->dimension(1));
-
- // Compute the ratio between source width/height and destination width/height
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
- ARM_COMPUTE_UNUSED(hr);
- // Get the element size of the input image
- const size_t input_element_size = input->info()->element_size();
-
- TensorInfo tensor_info_offsets(shape, Format::S32);
- _offsets.allocator()->init(tensor_info_offsets);
-
- _upsample.configure(input, &_offsets, output);
-
- // Allocate once the configure methods have been called
- _offsets.allocator()->allocate();
- // Pre-compute offsets for nearest interpolation
- std::fill_n(reinterpret_cast<int32_t *>(_offsets.buffer()), _offsets.info()->total_size() / sizeof(int32_t), -1 * input_element_size);
- precompute_offsets(&_offsets, wr, input_element_size, a, iz, info);
-
- _border_handler.configure(input, _upsample.border_size(), BorderMode::CONSTANT, PixelValue(0.f));
-}
-
-void NEDeconvolutionLayerUpsample::run()
-{
- NEScheduler::get().schedule(&_border_handler, Window::DimZ);
- _memory_group.acquire();
- NEScheduler::get().schedule(&_upsample, Window::DimY);
- _memory_group.release();
-}
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index b890c6f..2d08b45 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,28 +26,56 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _kernel(), _bias_kernel(), _border_handler(), _has_bias(false)
+ : _kernel(), _output_stage_kernel(), _border_handler(), _accumulator(), _has_bias(false), _is_quantized(false)
{
}
void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- // Call convolution kernel
- _kernel.configure(input, weights, output, conv_info);
- _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
- if(biases != nullptr)
+ PixelValue zero_value(0.f);
+
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
+ _has_bias = biases != nullptr;
+
+ // Allocate the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
{
- _bias_kernel.configure(output, biases);
- _has_bias = true;
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+ _accumulator.info()->set_quantization_info(input->info()->quantization_info());
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+
+ // Configure depthwise convolution kernel
+ _kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info);
+
+ // Configure border handler
+ _border_handler.configure(input, _kernel.border_size(), BorderMode::CONSTANT, zero_value);
+
+ // Configure biases accumulation
+ if(_has_bias || _is_quantized)
+ {
+ if(_is_quantized)
+ {
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output->info()->quantization_info().scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output->info()->quantization_info().offset);
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _output_stage_kernel.configure(output, biases);
+ }
}
}
@@ -55,9 +83,9 @@
{
NEScheduler::get().schedule(&_border_handler, Window::DimX);
NEScheduler::get().schedule(&_kernel, Window::DimX);
- if(_has_bias)
+ if(_has_bias || _is_quantized)
{
- NEScheduler::get().schedule(&_bias_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
}
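For QASYMM8 the requantisation factor input_scale * weights_scale / output_scale computed above is a small real number, and calculate_quantized_multiplier_less_than_one splits it into an integer multiplier plus a right shift so the output stage can stay in integer arithmetic. The sketch below shows the usual gemmlowp-style decomposition as a rough illustration of what such a helper computes; it is not the library's implementation.

#include <cmath>
#include <cstdint>

// Decompose real_multiplier (0 < real_multiplier < 1) into a Q0.31 fixed-point multiplier
// and a right shift such that real_multiplier ~= multiplier * 2^-31 * 2^-right_shift.
void decompose_multiplier(double real_multiplier, int32_t *multiplier, int *right_shift)
{
    int exponent = 0;
    const double mantissa = std::frexp(real_multiplier, &exponent); // mantissa in [0.5, 1)
    int64_t q = static_cast<int64_t>(std::round(mantissa * (1ll << 31)));
    if(q == (1ll << 31)) // the mantissa rounded up to 1.0
    {
        q /= 2;
        ++exponent;
    }
    *multiplier  = static_cast<int32_t>(q);
    *right_shift = -exponent;
}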
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index afa5d97..c26c99a 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
+ : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false), _is_fixed_point(false)
{
}
@@ -50,16 +50,16 @@
_has_bias = (bias != nullptr);
// Allocate the intermediate accumulator tensor in case of fixed point input
- if(is_data_type_fixed_point(input->info()->data_type()))
+ _is_fixed_point = is_data_type_fixed_point(input->info()->data_type());
+ if(_is_fixed_point)
{
const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
_accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
_memory_group.manage(&_accumulator);
_conv_kernel.configure(input, weights, &_accumulator, conv_info);
- if(_has_bias)
- {
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- }
+
+ // Even when no bias is provided, the output stage is still needed to downscale the accumulator tensor
+ _output_stage_kernel.configure(&_accumulator, bias, output);
_accumulator.allocator()->allocate();
}
else
@@ -67,7 +67,7 @@
_conv_kernel.configure(input, weights, output, conv_info);
if(_has_bias)
{
- _accumulate_bias_kernel.configure(output, bias);
+ _output_stage_kernel.configure(output, bias);
}
}
@@ -90,20 +90,17 @@
// Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
- // Validate bias
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
- "Biases should be provided for fixed point inputs");
if(bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
"Biases size and number of input feature maps should match");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
-
- // Validate bias kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
}
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&accumulator, bias, output));
+
return Status{};
}
@@ -114,10 +111,9 @@
_memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
- if(_has_bias)
+ if(_has_bias || _is_fixed_point)
{
- NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ NEScheduler::get().schedule(&_output_stage_kernel, Window::DimY);
}
-
_memory_group.release();
}
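On the fixed-point path the output stage kernel is now configured unconditionally: even without a bias, the promoted accumulator has to be brought back to the output's Q format. As background on why a shift is involved, the snippet below shows the basic Q-format multiply-and-downscale arithmetic (raw = real * 2^fixed_point_position); it is a standalone sketch of the idea, not the kernel's code, and assumes fixed_point_position >= 1.

#include <algorithm>
#include <cstdint>

// Multiply two Q-format values and return the product in the same format: the raw product
// is scaled by 2^(2 * fp), so it is shifted back down by fp with rounding and saturated.
int8_t qs8_mul(int8_t a, int8_t b, int fixed_point_position)
{
    const int32_t prod     = static_cast<int32_t>(a) * static_cast<int32_t>(b);
    const int32_t rounding = 1 << (fixed_point_position - 1);
    const int32_t shifted  = (prod + rounding) >> fixed_point_position;
    return static_cast<int8_t>(std::min(127, std::max(-128, shifted)));
}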
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 950f4c9..e640b06 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -50,15 +50,17 @@
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _mm_optimised_kernel(nullptr), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(),
- _run_vector_matrix_multiplication(false), _run_addition(false)
+ _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
{
}
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta)
+void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16, DataType::QS8, DataType::QS16);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, d);
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
if(c != nullptr)
{
@@ -70,6 +72,8 @@
ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != d->info()->dimension(1), "The C matrix must have the same number of columns as the output matrix");
}
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
// Check if the first input tensor is a vector.
@@ -142,7 +146,7 @@
_memory_group.manage(&_workspace);
// Configure matrix multiplication kernel
- _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f);
+ _mm_optimised_kernel->configure(a, b, d, &_workspace, alpha, 0.f, false /* is_transposed_0 */, false /* is_transposed_1 */);
_workspace.allocator()->allocate();
}
else
@@ -207,8 +211,18 @@
// Run interleave kernel
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ if(_is_first_run)
+ {
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+
+ _is_first_run = false;
+ }
+ else if(!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ }
}
NEScheduler::get().schedule(&_mm_kernel, _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
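The new GEMMInfo argument lets callers state that B (typically constant weights) only needs to be transposed once; run() then schedules the transpose kernel on the first invocation and skips it afterwards. A hedged usage sketch follows; the three-boolean GEMMInfo constructor and the tensor setup are assumed from the v18.01 public headers rather than taken from this patch, and the transpose short-cut only matters on the generic (non-assembly) kernel path.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_gemm_with_constant_b()
{
    // D = A * B with A: 16x64 and B: 64x32 (ACL shapes are {columns, rows}).
    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));

    NEGEMM gemm;
    // Neither input is pre-reshaped; B is constant, so transpose it only on the first run().
    gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    gemm.run(); // first call: B is transposed
    gemm.run(); // subsequent calls reuse the transposed B
}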
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 6e03ffa..9b36e81 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -74,7 +74,7 @@
#endif /* __aarch64__ */
#ifdef ARM_COMPUTE_AARCH64_V8_2
- if(ci.CPU == CPUTarget::A75_DOT)
+ if(ci.CPU == CPUTarget::A75_DOT || ci.CPU == CPUTarget::A55_DOT)
{
// Configure matrix multiply kernel
GemmInterleaved<gemm_s8_12x8, int8_t, int32_t> gemm(&ci, M, N, K, false, false);
@@ -87,10 +87,6 @@
_mm_kernel = std::move(k);
_workspace.allocator()->allocate();
}
- else if(ci.CPU == CPUTarget::A55_DOT)
- {
- ARM_COMPUTE_ERROR_ON("WIP");
- }
else
#elif defined(ARM_COMPUTE_AARCH64_V8A)
if(ci.CPU == CPUTarget::A53)
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 50aa5b6..c4028dc 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -84,7 +84,7 @@
// Configure matrix multiplication kernel
auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpAArch64V8P4Kernel>();
- k->configure(a, b, output, &_workspace, 1.f, 1.f);
+ k->configure(a, b, output, &_workspace, 1.f, 1.f, false, false);
_mm_kernel = std::move(k);
}
else
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 530c7fc..8a32507 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,9 +43,14 @@
// Configure pooling kernel
_pooling_layer_kernel.configure(input, output, pool_info);
- // Configure border depending on operation required
+ // Configure border depending on operation required (quantize border in case of asymmetric data_type)
BorderMode border_mode = (pool_info.pool_type() == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, PixelValue(static_cast<float>(0.f)));
+ PixelValue zero_value(0.f);
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ {
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+ _border_handler.configure(input, _pooling_layer_kernel.border_size(), border_mode, zero_value);
}
Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
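For average pooling over a QASYMM8 tensor whose divisor includes the padding, the constant border has to represent a real value of 0; under the asymmetric scheme q = round(r / scale) + offset that quantised zero is simply the offset, which is why the hunk above fills the border with the tensor's quantisation offset instead of 0.f. A tiny standalone illustration of the arithmetic:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Asymmetric 8-bit quantisation: q = round(r / scale) + offset, clamped to [0, 255].
uint8_t quantize_qasymm8(float r, float scale, int offset)
{
    const int q = static_cast<int>(std::round(r / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// quantize_qasymm8(0.f, scale, offset) == offset for any scale, i.e. the value used above
// to fill the constant border.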
diff --git a/src/runtime/NEON/functions/NEWinogradLayer.cpp b/src/runtime/NEON/functions/NEWinogradLayer.cpp
index 3251de4..da46f87 100644
--- a/src/runtime/NEON/functions/NEWinogradLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,8 @@
namespace arm_compute
{
NEWinogradLayer::NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _winograd_kernel(), _weights_workspace(), _workspace(), _kernel_storage(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
+ : _memory_group(std::move(memory_manager)), _winograd_kernel(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(),
+ _output_nhwc(), _weights_hwio(), _input(), _weights(), _output(), _reshaped_kernel(false), _conv()
{
} /* arm_compute */
@@ -71,85 +72,107 @@
ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
// Get convolved dimensions
- auto padding = PADDING_VALID;
- const int in_channels = input->info()->dimension(2);
+ const int in_channels = input->info()->dimension(2);
+ const int out_channels = output->info()->dimension(2);
- const int out_channels = output->info()->dimension(2);
- const int weights_width = weights->info()->dimension(0);
- const int weights_height = weights->info()->dimension(1);
-
- const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
const Tensor4DShape in_shape(internal_get_input_shape(input));
// Get the memory required to instantiate a new Winograd operator.
- constexpr size_t kstore_alignment = 64;
- const size_t kernel_storage_per_thread = NEWinogradLayerKernel::get_kernel_storage_size(kernel_shape);
- _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_per_thread + kstore_alignment - 1) }, 1, DataType::U8));
+ constexpr size_t storage_alignment = 64;
+ const size_t kernel_storage_size = NEWinogradLayerKernel::get_weight_storage_size(out_channels, in_channels) * sizeof(float);
+ _kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
_memory_group.manage(&_kernel_storage);
-
- // Get workbench size and allocate memory
- constexpr size_t wspace_alignment = 64;
- const size_t ws_size = NEWinogradLayerKernel::get_working_space_size(in_shape, kernel_shape, padding);
- _workspace.allocator()->init(TensorInfo(TensorShape{ (ws_size + wspace_alignment - 1) }, 1, DataType::U8));
- _memory_group.manage(&_workspace);
-
- // Workspace for weights transform
- const size_t weights_transform_size = NEWinogradLayerKernel::get_kernel_transform_working_size(kernel_shape);
- _weights_workspace.allocator()->init(TensorInfo(TensorShape{ (weights_transform_size + wspace_alignment - 1) }, 1, DataType::U8));
- _memory_group.manage(&_weights_workspace);
-
+ _memory_group.manage(&_input_nhwc);
_kernel_storage.allocator()->allocate();
- _workspace.allocator()->allocate();
- _weights_workspace.allocator()->allocate();
+ // Input storage
+ const size_t input_storage_size = NEWinogradLayerKernel::get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, false) * sizeof(float);
+ _input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_input_workspace);
+ _input_workspace.allocator()->allocate();
+
+ // Output storage
+ const size_t output_storage_size = NEWinogradLayerKernel::get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels, false) * sizeof(float);
+ _output_workspace.allocator()->init(TensorInfo(TensorShape{ (output_storage_size + storage_alignment - 1) }, 1, DataType::U8));
+ _memory_group.manage(&_output_workspace);
+ _output_workspace.allocator()->allocate();
+
+ // Configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling reshape_output()
+ TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
+ _output->info()->dimension(1), _output->info()->dimension(3)),
+ 1, _output->info()->data_type());
+ _output_nhwc.allocator()->init(info);
+
+ _output_nhwc.allocator()->allocate();
+
+ // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
+ switch(weights->info()->num_dimensions())
+ {
+ case 3:
+ {
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ break;
+ }
+ case 4:
+ {
+ _permute_weights.configure(weights, &_weights_hwio, PermutationVector(3U, 2U, 0U, 1U));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ }
+
+ _weights_hwio.allocator()->allocate();
+
+ // configure the kernel to transform the input tensor from NCHW -> NHWC
+ _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+
+ _input_nhwc.allocator()->allocate();
// Create Winograd operator object
- _conv = support::cpp14::make_unique<Winograd3x3F32>(kernel_shape, in_shape, padding, _kernel_storage.buffer());
+ _conv = support::cpp14::make_unique<Winograd3x3F32>(
+ in_shape.n_batches,
+ in_shape.n_channels,
+ in_shape.n_rows,
+ in_shape.n_cols,
+ out_channels,
+ false,
+ reinterpret_cast<const float *>(_weights_hwio.buffer()),
+ reinterpret_cast<float *>(_kernel_storage.buffer()),
+ reinterpret_cast<float *>(_input_nhwc.buffer()),
+ reinterpret_cast<float *>(_input_workspace.buffer()),
+ reinterpret_cast<float *>(_output_nhwc.buffer()),
+ reinterpret_cast<float *>(_output_workspace.buffer()));
// Configure the kernel, padding not needed so it's safe to call configure after allocate
- _winograd_kernel.configure(output, _conv.get());
+ _winograd_kernel.configure(_conv.get());
+
+ // Reorder the convolved output to ACL's ordering NCHW
+ _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
+
}
void NEWinogradLayer::run()
{
-#if defined(__aarch64__)
_memory_group.acquire();
if(!_reshaped_kernel)
{
- _conv->transform_weights(reinterpret_cast<const float *>(_weights->buffer()), reinterpret_cast<float *>(_weights_workspace.buffer()));
_reshaped_kernel = true;
+ _permute_weights.run();
+ _conv->transform_weights();
}
- const Tensor4DShape in_shape(internal_get_input_shape(_input));
- auto padding = PADDING_VALID;
-
//Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _conv->nchw2nhwc(in_shape, padding, _workspace.buffer(), reinterpret_cast<const float *>(_input->buffer()));
-
- //Get ptrs into the workspace
- std::pair<void *, void *> nhwc_ptrs = _conv->get_nhwc_ptrs(in_shape, padding, _workspace.buffer());
-
- //Setup matrices ptrs and transfor the input tensor to the appropriate form before running GEMM.
- _conv->reshape_input(in_shape, padding, nhwc_ptrs.second, _workspace.buffer());
-
+ _permute_input.run();
+ // Transform input tensor to the winograd domain
+ _conv->transform_input();
//Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- NEScheduler::get().schedule(&_winograd_kernel, Window::DimY);
-
- //Transform the output to the appropriate form
- _conv->reshape_output(in_shape, padding, nhwc_ptrs.first);
-
- //Transform back to NCHW
- _conv->nhwc2nchw(in_shape, padding, _workspace.buffer(), reinterpret_cast<float *>(_output->buffer()));
-
+ NEScheduler::get().schedule(&_winograd_kernel, Window::DimX);
+ // Transform output tensor to the spatial domain
+ _conv->transform_output();
+ // Reorder the convolved output to ACL's ordering NCHW
+ _permute_output.run();
_memory_group.release();
-#else /* __aarch64__ */
- ARM_COMPUTE_UNUSED(_winograd_kernel);
- ARM_COMPUTE_UNUSED(_workspace);
- ARM_COMPUTE_UNUSED(_kernel_storage);
- ARM_COMPUTE_UNUSED(_input);
- ARM_COMPUTE_UNUSED(_weights);
- ARM_COMPUTE_UNUSED(_output);
- ARM_COMPUTE_UNUSED(_reshaped_kernel);
- ARM_COMPUTE_UNUSED(_conv);
- ARM_COMPUTE_ERROR("Winograd only supported for aarch64, recompile with arch=arm64-v8a.");
-#endif /* __aarch64__ */
}
} // namespace arm_compute
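The rewritten Winograd path permutes the weights to HWIO and keeps its input/output workspaces in NHWC, converting back to ACL's default NCHW only after the output transform. For reference, the standalone sketch below spells out how an (n, c, h, w) element is addressed in the two layouts; it is an illustration of the layout difference, not library code.

#include <cstddef>

// Linear offset of element (n, c, h, w) in an NCHW buffer: w varies fastest, then h, then c.
size_t offset_nchw(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W)
{
    return ((n * C + c) * H + h) * W + w;
}

// Linear offset of the same element in an NHWC buffer: c varies fastest, then w, then h.
size_t offset_nhwc(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W)
{
    return ((n * H + h) * W + w) * C + c;
}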
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index 32924be..c5b8f33 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,11 +27,11 @@
using namespace arm_compute;
-SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords)
+SubTensor::SubTensor(ITensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
: _parent(nullptr), _info()
{
ARM_COMPUTE_ERROR_ON(parent == nullptr);
- _info = SubTensorInfo(parent->info(), tensor_shape, coords);
+ _info = SubTensorInfo(parent->info(), tensor_shape, coords, extend_parent);
_parent = parent;
}